@@ -1872,14 +1872,6 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
1872
1872
v.y = x[ib + iqs + 1 ];
1873
1873
}
1874
1874
1875
- static __device__ void convert_f32 (const void * vx, const int ib, const int iqs, dfloat2 & v){
1876
- const float * x = (const float *) vx;
1877
-
1878
- // automatic half -> float type cast if dfloat == float
1879
- v.x = x[ib + iqs + 0 ];
1880
- v.y = x[ib + iqs + 1 ];
1881
- }
1882
-
1883
1875
static __global__ void quantize_q8_1 (const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
1884
1876
const int ix = blockDim .x *blockIdx .x + threadIdx .x ;
1885
1877
@@ -1983,7 +1975,7 @@ static __global__ void k_get_rows_float(
1983
1975
1984
1976
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t >
1985
1977
static __global__ void dequantize_block (const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
1986
- const int i = blockDim .x *blockIdx .x + 2 * threadIdx .x ;
1978
+ const int i = 2 *( blockDim .x *blockIdx .x + threadIdx .x ) ;
1987
1979
1988
1980
if (i >= k) {
1989
1981
return ;
@@ -2002,6 +1994,19 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
2002
1994
y[iybs + iqs + y_offset] = v.y ;
2003
1995
}
2004
1996
1997
// Elementwise type-conversion kernel: y[i] = (dst_t) x[i] for i in [0, k).
// Expects a 1D launch with one thread per element; the tail guard makes any
// grid size that covers k safe (grids rarely divide k evenly).
template <typename src_t, typename dst_t>
static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }

    // const-correct cast: the original C-style `(src_t *) vx` silently cast
    // away the const of `const void *` before re-adding it via the lhs type.
    const src_t * x = static_cast<const src_t *>(vx);

    // implicit src_t -> dst_t conversion (e.g. half -> float or float -> half)
    y[i] = x[i];
}
2005
2010
// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
2006
2011
// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
2007
2012
@@ -5609,7 +5614,7 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con
5609
5614
5610
5615
// Host-side launcher for dequantize_block. Each thread of the kernel handles
// two output values, so the grid must cover k elements with
// ceil(k / (2 * block_size)) blocks.
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
    const int block_size = CUDA_DEQUANTIZE_BLOCK_SIZE;
    const int num_blocks = (k + 2*block_size - 1) / (2*block_size); // ceil-div, 2 elems per thread
    dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, block_size, 0, stream>>>(vx, y, k);
}
@@ -5659,6 +5664,12 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
5659
5664
#endif
5660
5665
}
5661
5666
5667
// Host-side launcher for convert_unary: one thread per element, grid sized
// by ceil-division so the final partial block covers the tail of k.
template <typename src_t, typename dst_t>
static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
    // spell out both template arguments (dst_t would otherwise be deduced from y)
    convert_unary<src_t, dst_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
}
5662
5673
static to_fp16_cuda_t ggml_get_to_fp16_cuda (ggml_type type) {
5663
5674
switch (type) {
5664
5675
case GGML_TYPE_Q4_0:
@@ -5682,7 +5693,7 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
5682
5693
case GGML_TYPE_Q6_K:
5683
5694
return dequantize_row_q6_K_cuda;
5684
5695
case GGML_TYPE_F32:
5685
- return dequantize_block_cuda< 1 , 1 , convert_f32 >;
5696
+ return convert_unary_cuda< float >;
5686
5697
default :
5687
5698
return nullptr ;
5688
5699
}
@@ -5711,7 +5722,7 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
5711
5722
case GGML_TYPE_Q6_K:
5712
5723
return dequantize_row_q6_K_cuda;
5713
5724
case GGML_TYPE_F16:
5714
- return dequantize_block_cuda< 1 , 1 , convert_f16 >;
5725
+ return convert_unary_cuda<half >;
5715
5726
default :
5716
5727
return nullptr ;
5717
5728
}
0 commit comments