
Commit fb2b9ea

Merge branch 'master' into pr/8836

2 parents: 3a027b8 + e11bd85

23 files changed: +1656 -778 lines changed

CMakePresets.json
Lines changed: 4 additions & 1 deletion

@@ -28,6 +28,7 @@
     { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
     { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
     { "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
+    { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },

     {
         "name": "arm64-windows-msvc", "hidden": true,
@@ -60,6 +61,8 @@
     { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },

     { "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] },
-    { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
+    { "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
+    { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] },
+    { "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] }
   ]
 }

convert_hf_to_gguf.py
Lines changed: 10 additions & 5 deletions

@@ -63,14 +63,15 @@ class Model:
     model_name: str | None
     metadata_override: Path | None
     dir_model_card: Path
+    is_lora: bool

     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH

     def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
                  use_temp_file: bool = False, eager: bool = False,
                  metadata_override: Path | None = None, model_name: str | None = None,
-                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
+                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False, is_lora: bool = False):
         if type(self) is Model:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")

@@ -92,6 +93,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
         self.metadata_override = metadata_override
         self.model_name = model_name
         self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py
+        self.is_lora = is_lora  # true if model is used inside convert_lora_to_gguf.py

         # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
         if self.ftype == gguf.LlamaFileType.GUESSED:
@@ -1593,7 +1595,8 @@ def prepare_tensors(self):
                         smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
                         rope_factors.append(1 / ((1 - smooth) / factor + smooth))

-                self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
+                if not self.is_lora:
+                    self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))

         super().prepare_tensors()

@@ -2140,8 +2143,9 @@ def set_gguf_parameters(self):
         if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
             raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')

-        self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG]  + ".weight", np.array(long_factors, dtype=np.float32))
-        self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
+        if not self.is_lora:
+            self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG]  + ".weight", np.array(long_factors, dtype=np.float32))
+            self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))


 @Model.register("PlamoForCausalLM")
@@ -3839,7 +3843,8 @@ def prepare_tensors(self):
                         smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
                         rope_factors.append(1 / ((1 - smooth) / factor + smooth))

-                self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
+                if not self.is_lora:
+                    self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))

         super().prepare_tensors()

convert_lora_to_gguf.py
Lines changed: 1 addition & 0 deletions

@@ -386,6 +386,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             dry_run=args.dry_run,
             dir_lora_model=dir_lora,
             lora_alpha=alpha,
+            is_lora=True,
         )

         logger.info("Exporting model...")

docs/backend/SYCL.md
Lines changed: 6 additions & 8 deletions

@@ -20,18 +20,14 @@
 **oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:

 - **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
-- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL - Math Kernel Library)*.
+- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL and oneDNN)*.
 - **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
 - **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.

 ### Llama.cpp + SYCL

 The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it could support other vendor GPUs: Nvidia GPU (*AMD GPU coming*).

-When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.
-
-It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
-
 ## Recommended Release

 The SYCL backend would be broken by some PRs due to no online CI.
@@ -45,6 +41,10 @@ The following release is verified with good quality:

 ## News

+
+- 2024.8
+  - Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs.
+
 - 2024.5
   - Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770.
   - Arch Linux is verified successfully.
@@ -196,7 +196,7 @@ Please follow the instructions for downloading and installing the Toolkit for Li

 Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.

-Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI MKL for intel GPUs.
+Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI oneDNN for Intel GPUs.

 - **Adding support to Nvidia GPUs**

@@ -255,8 +255,6 @@ or
 # Export relevant ENV variables
 source /opt/intel/oneapi/setvars.sh

-# Build LLAMA with MKL BLAS acceleration for intel GPU
-
 # Option 1: Use FP32 (recommended for better performance in most cases)
 cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

examples/quantize/quantize.cpp
Lines changed: 1 addition & 1 deletion

@@ -111,7 +111,7 @@ static void usage(const char * executable) {
     printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
     printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
     printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
-    printf(" --keep-split: will generate quatized model in the same shards as input");
+    printf(" --keep-split: will generate quantized model in the same shards as input\n");
     printf(" --override-kv KEY=TYPE:VALUE\n");
     printf("     Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
     printf("Note: --include-weights and --exclude-weights cannot be used together\n");

ggml/include/ggml.h
Lines changed: 5 additions & 7 deletions

@@ -1760,7 +1760,8 @@ extern "C" {
             struct ggml_tensor  * v,
             struct ggml_tensor  * mask,
             float                 scale,
-            float                 max_bias);
+            float                 max_bias,
+            float                 logit_softcap);

     GGML_API void ggml_flash_attn_ext_set_prec(
             struct ggml_tensor * a,
@@ -1777,10 +1778,8 @@ extern "C" {

     GGML_API struct ggml_tensor * ggml_ssm_conv(
             struct ggml_context * ctx,
-            struct ggml_tensor  * s,
-            struct ggml_tensor  * x,
-            struct ggml_tensor  * c,
-            struct ggml_tensor  * sq);
+            struct ggml_tensor  * sx,
+            struct ggml_tensor  * c);

     GGML_API struct ggml_tensor * ggml_ssm_scan(
             struct ggml_context * ctx,
@@ -1789,8 +1788,7 @@ extern "C" {
             struct ggml_tensor  * dt,
             struct ggml_tensor  * A,
             struct ggml_tensor  * B,
-            struct ggml_tensor  * C,
-            struct ggml_tensor  * sq);
+            struct ggml_tensor  * C);

     // partition into non-overlapping windows with padding if needed
     // example:
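Note: `ggml_flash_attn_ext` now takes a trailing `logit_softcap` argument; passing 0.0f keeps the previous, uncapped behaviour. A minimal caller-side sketch, not part of this commit, with the scale and cap values chosen purely for illustration:

    #include "ggml.h"

    // Sketch only: how a caller might pass the new argument.
    static struct ggml_tensor * build_attn(
            struct ggml_context * ctx,
            struct ggml_tensor  * q,
            struct ggml_tensor  * k,
            struct ggml_tensor  * v,
            struct ggml_tensor  * mask) {
        const float scale         = 1.0f/8.0f; // assumed 1/sqrt(head_dim) with head_dim == 64
        const float max_bias      = 0.0f;      // no ALiBi bias
        const float logit_softcap = 30.0f;     // hypothetical cap; 0.0f disables softcapping
        return ggml_flash_attn_ext(ctx, q, k, v, mask, scale, max_bias, logit_softcap);
    }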

ggml/src/CMakeLists.txt
Lines changed: 10 additions & 0 deletions

@@ -549,6 +549,13 @@ if (GGML_SYCL)
     file(GLOB   GGML_SOURCES_SYCL "ggml-sycl/*.cpp")
     list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")

+    find_package(DNNL)
+    message("-- DNNL found:" ${DNNL_FOUND})
+    if (GGML_SYCL_TARGET STREQUAL "INTEL")
+        add_compile_definitions(GGML_SYCL_DNNL=${DNNL_FOUND})
+    else()
+        add_compile_definitions(GGML_SYCL_DNNL=0)
+    endif()
     if (WIN32)
         find_package(IntelSYCL REQUIRED)
         find_package(MKL REQUIRED)
@@ -561,6 +568,9 @@ if (GGML_SYCL)
             set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl)
         endif()
     endif()
+    if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
+        list(APPEND GGML_EXTRA_LIBS DNNL::dnnl)
+    endif()
 endif()

 if (GGML_RPC)
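Note: because `GGML_SYCL_DNNL` is always defined here (to the `find_package` result on Intel targets, 0 otherwise), source files can select the GEMM path with a plain preprocessor check. A hedged sketch, assuming the definition expands to a numeric value; the function name is invented for illustration:

    // Illustrative only; not code from this commit.
    #if GGML_SYCL_DNNL
    static const char * ggml_sycl_gemm_backend(void) { return "oneDNN"; }
    #else
    static const char * ggml_sycl_gemm_backend(void) { return "oneMKL"; }
    #endif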

ggml/src/ggml-cuda/fattn-common.cuh
Lines changed: 12 additions & 5 deletions

@@ -22,6 +22,7 @@ typedef void (* fattn_kernel_t)(
        const float m0,
        const float m1,
        const uint32_t n_head_log2,
+       const float logit_softcap,
        const int ne00,
        const int ne01,
        const int ne02,
@@ -657,11 +658,17 @@ void launch_fattn(
    const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
    const int  shmem = 0;

-    float scale    = 1.0f;
-    float max_bias = 0.0f;
+    float scale         = 1.0f;
+    float max_bias      = 0.0f;
+    float logit_softcap = 0.0f;

-    memcpy(&scale,    (float *) KQV->op_params + 0, sizeof(float));
-    memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));
+    memcpy(&scale,         (float *) KQV->op_params + 0, sizeof(float));
+    memcpy(&max_bias,      (float *) KQV->op_params + 1, sizeof(float));
+    memcpy(&logit_softcap, (float *) KQV->op_params + 2, sizeof(float));
+
+    if (logit_softcap != 0.0f) {
+        scale /= logit_softcap;
+    }

    const uint32_t n_head      = Q->ne[2];
    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
@@ -675,7 +682,7 @@ void launch_fattn(
        V_data,
        mask ? ((const char *) mask->data) : nullptr,
        (parallel_blocks) == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
-        scale, max_bias, m0, m1, n_head_log2,
+        scale, max_bias, m0, m1, n_head_log2, logit_softcap,
        Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
        K->ne[0], K->ne[1], K->ne[2], K->ne[3],
        mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
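Note: dividing `scale` by `logit_softcap` on the host folds the softcap formula `softcap * tanh(scale * s / softcap)` into the kernels, which then only apply `softcap * tanhf(...)` to the already-scaled sum. A small host-side sketch (values are arbitrary assumptions) showing that the two forms agree:

    #include <cmath>
    #include <cstdio>

    int main() {
        const float s = 3.7f, scale = 0.125f, softcap = 30.0f;            // arbitrary example values
        const float direct = softcap * std::tanh(scale * s / softcap);    // textbook softcap form
        const float folded = softcap * std::tanh((scale / softcap) * s);  // scale pre-divided, as in launch_fattn
        std::printf("%f %f\n", direct, folded);                           // identical up to float rounding
        return 0;
    }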

ggml/src/ggml-cuda/fattn-tile-f16.cu
Lines changed: 43 additions & 9 deletions

@@ -4,7 +4,7 @@

 #define FATTN_KQ_STRIDE_TILE_F16 64

-template<int D, int ncols, int nwarps, int parallel_blocks> // D == head size
+template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(nwarps*WARP_SIZE, 1)
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
@@ -20,6 +20,7 @@ static __global__ void flash_attn_tile_ext_f16(
        const float m0,
        const float m1,
        const uint32_t n_head_log2,
+       const float logit_softcap,
        const int ne00,
        const int ne01,
        const int ne02,
@@ -44,6 +45,12 @@ static __global__ void flash_attn_tile_ext_f16(
        const int ne2,
        const int ne3) {
 #ifdef FP16_AVAILABLE
+    // Skip unused kernel variants for faster compilation:
+    if (use_logit_softcap && !(D == 128 || D == 256)) {
+        NO_DEVICE_CODE;
+        return;
+    }
+
    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.

    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
@@ -154,7 +161,13 @@ static __global__ void flash_attn_tile_ext_f16(
            for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
                const int j_KQ = j_KQ_0 + threadIdx.y;

-                half sum = __low2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]) + __high2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
+                half sum;
+                if (use_logit_softcap) {
+                    const float2 tmp = __half22float2(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
+                    sum = logit_softcap * tanhf(tmp.x + tmp.y);
+                } else {
+                    sum = __low2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]) + __high2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
+                }
                sum += mask ? slopeh*maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);

                kqmax_new[j_KQ_0/nwarps] = ggml_cuda_hmax(kqmax_new[j_KQ_0/nwarps], sum);
@@ -270,20 +283,20 @@ static __global__ void flash_attn_tile_ext_f16(
 #endif // FP16_AVAILABLE
 }

-template <int cols_per_block, int parallel_blocks>
+template <int cols_per_block, int parallel_blocks, bool use_logit_softcap>
 void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * Q = dst->src[0];
    switch (Q->ne[0]) {
        case 64: {
            constexpr int D      = 64;
            constexpr int nwarps = 8;
-            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks>;
+            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
        } break;
        case 128: {
            constexpr int D      = 128;
            constexpr int nwarps = 8;
-            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks>;
+            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
        } break;
        default: {
@@ -296,24 +309,45 @@ void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_ten
    const ggml_tensor * KQV = dst;
    const ggml_tensor * Q   = dst->src[0];

-    const int32_t precision = KQV->op_params[2];
+    const int32_t precision = KQV->op_params[3];
    GGML_ASSERT(precision == GGML_PREC_DEFAULT);

+    float logit_softcap;
+    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
+
    if (Q->ne[1] <= 16) {
        constexpr int cols_per_block = 16;
        constexpr int parallel_blocks = 4;
-        launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+        } else {
+            constexpr bool use_logit_softcap = true;
+            launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+        }
        return;
    }

    if (Q->ne[1] <= 32) {
        constexpr int cols_per_block = 32;
        constexpr int parallel_blocks = 4;
-        launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+        } else {
+            constexpr bool use_logit_softcap = true;
+            launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+        }
        return;
    }

    constexpr int cols_per_block = 32;
    constexpr int parallel_blocks = 1;
-    launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
+    if (logit_softcap == 0.0f) {
+        constexpr bool use_logit_softcap = false;
+        launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+    } else {
+        constexpr bool use_logit_softcap = true;
+        launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+    }
 }
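Note: the host-side dispatch above turns the runtime `logit_softcap` check into a compile-time template parameter, so instantiations that never softcap carry no tanh path. The same pattern in generic form, with names invented for the sketch:

    // Pattern sketch only; not code from this commit.
    template <bool use_logit_softcap>
    static void run_variant() {
        if constexpr (use_logit_softcap) { /* softcap path compiled only in this instantiation */ }
        else                             { /* plain path */ }
    }

    static void dispatch(float logit_softcap) {
        if (logit_softcap == 0.0f) {
            run_variant<false>();
        } else {
            run_variant<true>();
        }
    }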
