
Commit 2617c55

Merge pull request vllm-project#8 from ri938/organise
Organise
2 parents aaea899 + 878a370 commit 2617c55

File tree

8 files changed, +7 -6 lines changed


vllm/awq_quantization/kernels/csrc/pybind.cpp renamed to awq_ext/awq_kernels/pybind.cpp

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 
 #include <pybind11/pybind11.h>
 #include <torch/extension.h>
-#include "quantization/gemm_cuda.h"
+#include "gemm_cuda.h"
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 {

vllm/awq_quantization/kernels/setup.py renamed to awq_ext/setup.py

Lines changed: 2 additions & 2 deletions
@@ -15,8 +15,8 @@
         CUDAExtension(
             name="awq_inference_engine",
             sources=[
-                "csrc/pybind.cpp",
-                "csrc/quantization/gemm_cuda_gen.cu",
+                "awq_kernels/pybind.cpp",
+                "awq_kernels/gemm_cuda_gen.cu",
             ],
             extra_compile_args=extra_compile_args,
         ),
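For orientation, here is a minimal sketch of what the relocated awq_ext/setup.py presumably looks like around this hunk. Only the CUDAExtension name, sources, and extra_compile_args reference are taken from the diff; the setup() wrapper, the BuildExtension cmdclass, and the exact compile flags are assumptions based on standard torch.utils.cpp_extension usage, not on this commit.

# Sketch of awq_ext/setup.py after the rename (assumed layout).
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

# Assumed flags; the real values are defined elsewhere in the file.
extra_compile_args = {"cxx": ["-O3"], "nvcc": ["-O3"]}

setup(
    name="awq_inference_engine",
    ext_modules=[
        CUDAExtension(
            name="awq_inference_engine",
            sources=[
                "awq_kernels/pybind.cpp",        # paths from this commit
                "awq_kernels/gemm_cuda_gen.cu",
            ],
            extra_compile_args=extra_compile_args,
        ),
    ],
    cmdclass={"build_ext": BuildExtension},
)

Built from inside awq_ext/ (for example with pip install -e .), this should still produce the same awq_inference_engine extension module that the quantization layer below imports.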

vllm/awq_quantization/__init__.py

Whitespace-only changes.

vllm/awq_quantization/qmodule.py renamed to vllm/model_executor/layers/quant.py

Lines changed: 2 additions & 1 deletion
@@ -3,6 +3,7 @@
 import torch
 import torch.nn as nn
 
+
 try:
     import awq_inference_engine  # with CUDA kernels
 except ImportError as ex:
@@ -21,7 +22,7 @@ def forward(self, x):
         return self.act(x) / self.scales.view(1, 1, -1).to(x.device)
 
 
-class WQLinear(nn.Module):
+class AWQLinear(nn.Module):
     def __init__(
         self,
         w_bit,
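The rename keeps the guarded import of the compiled kernels and only changes the public class name. A hedged usage sketch follows; only w_bit, group_size, and in_features are confirmed by this diff and by the llama.py call below, so the remaining constructor arguments are assumptions that mirror the upstream llm-awq WQLinear signature.

import torch

from vllm.model_executor.layers import quant

# Hypothetical construction of the renamed layer; out_features, bias, and
# dev are assumed from the upstream llm-awq WQLinear and are not shown in
# this commit. The feature sizes are illustrative only.
layer = quant.AWQLinear(
    w_bit=4,           # 4-bit AWQ weights
    group_size=128,    # quantization group size
    in_features=4096,
    out_features=11008,
    bias=False,
    dev=torch.device("cuda"),
)

At runtime the layer is expected to dispatch its matmul to the awq_inference_engine CUDA kernels guarded by the try/except at the top of the file.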

vllm/model_executor/models/llama.py

Lines changed: 2 additions & 2 deletions
@@ -37,14 +37,14 @@
 from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.layers import quant
 from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
                                               load_tensor_parallel_weights)
 from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
 from vllm.model_executor.parallel_utils.tensor_parallel import (
     VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
 from vllm.sequence import SequenceOutputs
-from vllm.awq_quantization import qmodule
 
 KVCache = Tuple[torch.Tensor, torch.Tensor]
 
@@ -141,7 +141,7 @@ def forward(
 
 
 def get_quantized_layer(in_features, out_features, quant_config):
-    layer = qmodule.WQLinear(
+    layer = quant.AWQLinear(
         w_bit=quant_config.bits,
         group_size=quant_config.group_size,
         in_features=in_features,
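With the import switched to the new layers.quant module, get_quantized_layer remains the single place where the LLaMA model constructs a quantized projection. A sketch of how it is presumably called; the call site and the concrete feature sizes are not part of this diff and are illustrative only.

# Hypothetical call when building a quantized MLP projection; only the
# function signature and the quant_config fields (.bits, .group_size)
# are visible in this commit.
gate_proj = get_quantized_layer(
    in_features=4096,        # e.g. hidden_size
    out_features=11008,      # e.g. intermediate_size
    quant_config=quant_config,
)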

0 commit comments