diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index f3b545670b88..25665517fee2 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -185,11 +185,6 @@ def mixtral_lora_files():
     return snapshot_download(repo_id="SangBinCho/mixtral-lora")
 
 
-@pytest.fixture(scope="session")
-def mixtral_lora_files_all_target_modules():
-    return snapshot_download(repo_id="dyang415/mixtral-lora-v0")
-
-
 @pytest.fixture(scope="session")
 def gemma_lora_files():
     return snapshot_download(repo_id="wskwon/gemma-7b-test-lora")
diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py
index 5d59df365fe1..aea7691935df 100644
--- a/tests/lora/test_mixtral.py
+++ b/tests/lora/test_mixtral.py
@@ -69,45 +69,3 @@ def test_mixtral_lora(mixtral_lora_files, tp_size):
                      prompts=prompts) == expected_lora_output
     assert do_sample(llm, mixtral_lora_files, lora_id=2,
                      prompts=prompts) == expected_lora_output
-
-
-@pytest.mark.parametrize("tp_size", [4])
-@pytest.mark.parametrize("fully_shard", [True, False])
-def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
-                                         tp_size, fully_shard):
-    """This LoRA model has all supported Mixtral target modules"""
-
-    if torch.cuda.device_count() < tp_size:
-        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
-
-    prompts = [
-        "Question: What happens if you touch the eyes of a blind man?\nChoices:\nB: The man's sight will be restored.\nA: Nothing happens if you touch the eyes of a blind man.\nC: The man will get his eyesight back.\nD: The man will no longer be blind.\nAnswer:",  # noqa: E501
-        "Question: Which will increase the energy of motion of water molecules?\nChoices:\nA: add heat\nB: add ice\nC: remove some of the water\nD: place the water in the freezer\nAnswer:",  # noqa: E501
-        "Since Craig threw aluminum cans in the trash and Benjamin recycled, _ was environmentally irresponsible.\nChoices:\n1: Craig\n2: Benjamin\nAnswer:",  # noqa: E501
-    ]
-
-    llm = vllm.LLM(
-        MODEL_PATH,
-        enable_lora=True,
-        max_num_seqs=16,
-        max_loras=4,
-        distributed_executor_backend="ray",
-        tensor_parallel_size=tp_size,
-        fully_sharded_loras=fully_shard,
-        max_lora_rank=32,
-    )
-
-    expected_lora_output = [
-        "A: Nothing happens if you touch the eyes of a blind man.",
-        "A: add heat",
-        "1: Craig",
-    ]
-
-    assert do_sample(llm,
-                     mixtral_lora_files_all_target_modules,
-                     lora_id=1,
-                     prompts=prompts) == expected_lora_output
-    assert do_sample(llm,
-                     mixtral_lora_files_all_target_modules,
-                     lora_id=2,
-                     prompts=prompts) == expected_lora_output
diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py
index b4f3d8dc478a..d607bf66ebd4 100644
--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
@@ -178,7 +178,8 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
                                  model):
     if num_gpus_available < 2:
         pytest.skip(f"Not enough GPUs for tensor parallelism {2}")
-
+    if model.quantization == "GPTQ":
+        pytest.skip("GPTQ lora outputs are just incredibly unstable")
     llm_tp1 = vllm.LLM(
         model=model.model_path,
         enable_lora=True,