
Commit 2617c55

Merge pull request vllm-project#8 from ri938/organise
Organise
2 parents aaea899 + 878a370 commit 2617c55

File tree

8 files changed, +7 -6 lines changed


vllm/awq_quantization/kernels/csrc/pybind.cpp renamed to awq_ext/awq_kernels/pybind.cpp

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 
 #include <pybind11/pybind11.h>
 #include <torch/extension.h>
-#include "quantization/gemm_cuda.h"
+#include "gemm_cuda.h"
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 {

vllm/awq_quantization/kernels/setup.py renamed to awq_ext/setup.py

Lines changed: 2 additions & 2 deletions
@@ -15,8 +15,8 @@
         CUDAExtension(
             name="awq_inference_engine",
             sources=[
-                "csrc/pybind.cpp",
-                "csrc/quantization/gemm_cuda_gen.cu",
+                "awq_kernels/pybind.cpp",
+                "awq_kernels/gemm_cuda_gen.cu",
             ],
             extra_compile_args=extra_compile_args,
         ),
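For orientation, here is a minimal sketch of what the relocated awq_ext/setup.py presumably looks like around this hunk. Only the CUDAExtension name, sources, and extra_compile_args reference are taken from the diff; the setup() wrapper, the BuildExtension cmdclass, and the exact compile flags are assumptions based on standard torch.utils.cpp_extension usage, not on this commit.

# Sketch of awq_ext/setup.py after the rename (assumed layout).
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

# Assumed flags; the real values are defined elsewhere in the file.
extra_compile_args = {"cxx": ["-O3"], "nvcc": ["-O3"]}

setup(
    name="awq_inference_engine",
    ext_modules=[
        CUDAExtension(
            name="awq_inference_engine",
            sources=[
                "awq_kernels/pybind.cpp",        # paths from this commit
                "awq_kernels/gemm_cuda_gen.cu",
            ],
            extra_compile_args=extra_compile_args,
        ),
    ],
    cmdclass={"build_ext": BuildExtension},
)

Built from inside awq_ext/ (for example with pip install -e .), this should still produce the same awq_inference_engine extension module that the quantization layer below imports.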

vllm/awq_quantization/__init__.py

Whitespace-only changes.

vllm/awq_quantization/qmodule.py renamed to vllm/model_executor/layers/quant.py

Lines changed: 2 additions & 1 deletion
@@ -3,6 +3,7 @@
 import torch
 import torch.nn as nn
 
+
 try:
     import awq_inference_engine  # with CUDA kernels
 except ImportError as ex:
@@ -21,7 +22,7 @@ def forward(self, x):
         return self.act(x) / self.scales.view(1, 1, -1).to(x.device)
 
 
-class WQLinear(nn.Module):
+class AWQLinear(nn.Module):
     def __init__(
         self,
         w_bit,
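The rename keeps the guarded import of the compiled kernels and only changes the public class name. A hedged usage sketch follows; only w_bit, group_size, and in_features are confirmed by this diff and by the llama.py call below, so the remaining constructor arguments are assumptions that mirror the upstream llm-awq WQLinear signature.

import torch

from vllm.model_executor.layers import quant

# Hypothetical construction of the renamed layer; out_features, bias, and
# dev are assumed from the upstream llm-awq WQLinear and are not shown in
# this commit. The feature sizes are illustrative only.
layer = quant.AWQLinear(
    w_bit=4,           # 4-bit AWQ weights
    group_size=128,    # quantization group size
    in_features=4096,
    out_features=11008,
    bias=False,
    dev=torch.device("cuda"),
)

At runtime the layer is expected to dispatch its matmul to the awq_inference_engine CUDA kernels guarded by the try/except at the top of the file.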

vllm/model_executor/models/llama.py

Lines changed: 2 additions & 2 deletions
@@ -37,14 +37,14 @@
 from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.layers import quant
 from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
                                               load_tensor_parallel_weights)
 from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
 from vllm.model_executor.parallel_utils.tensor_parallel import (
     VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
 from vllm.sequence import SequenceOutputs
-from vllm.awq_quantization import qmodule
 
 KVCache = Tuple[torch.Tensor, torch.Tensor]
 
@@ -141,7 +141,7 @@ def forward(
 
 
 def get_quantized_layer(in_features, out_features, quant_config):
-    layer = qmodule.WQLinear(
+    layer = quant.AWQLinear(
         w_bit=quant_config.bits,
         group_size=quant_config.group_size,
         in_features=in_features,
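With the import switched to the new layers.quant module, get_quantized_layer remains the single place where the LLaMA model constructs a quantized projection. A sketch of how it is presumably called; the call site and the concrete feature sizes are not part of this diff and are illustrative only.

# Hypothetical call when building a quantized MLP projection; only the
# function signature and the quant_config fields (.bits, .group_size)
# are visible in this commit.
gate_proj = get_quantized_layer(
    in_features=4096,        # e.g. hidden_size
    out_features=11008,      # e.g. intermediate_size
    quant_config=quant_config,
)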

0 commit comments