
Commit 201cc11

llama : add phi3 128K model support (#7225)
* add phi3 128k support in convert-hf-to-gguf
* add phi3 128k support in cuda
* address build warnings on llama.cpp
* adjust index value in cuda long rope freq factors
* add long rope support in ggml cpu backend
* make freq factors only depend on ctx size
* remove unused rope scaling type 'su' from gguf converter
* fix lint warnings on convert-hf-to-gguf.py
* set to the short freq factor when context size is smaller than trained context size
* add one line of comments
* metal : support rope freq_factors
* ggml : update ggml_rope_ext API to support freq. factors
* backends : add dev messages to support rope freq. factors
* minor : style
* tests : update to use new rope API
* backends : fix pragma semicolons
* minor : cleanup
* llama : move rope factors from KV header to tensors
* llama : remove tmp assert
* cuda : fix compile warning
* convert : read/write n_head_kv
* llama : fix uninitialized tensors

---------

Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 6369bf0 commit 201cc11

File tree: 15 files changed (+478, −227 lines)


convert-hf-to-gguf.py

Lines changed: 43 additions & 6 deletions
```diff
@@ -14,6 +14,7 @@
 from hashlib import sha256
 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast
 
+import math
 import numpy as np
 import torch
 
@@ -1784,23 +1785,59 @@ def set_vocab(self):
     def set_gguf_parameters(self):
         block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
 
-        rot_pct = 1.0
         n_embd = self.find_hparam(["hidden_size", "n_embd"])
         n_head = self.find_hparam(["num_attention_heads", "n_head"])
+        n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
         rms_eps = self.find_hparam(["rms_norm_eps"])
+        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
+        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
+        rope_dims = n_embd // n_head
 
         self.gguf_writer.add_name("Phi3")
-        self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
-
+        self.gguf_writer.add_context_length(max_pos_embds)
+        self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
         self.gguf_writer.add_embedding_length(n_embd)
-        self.gguf_writer.add_feed_forward_length(8192)
+        self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"]))
         self.gguf_writer.add_block_count(block_count)
         self.gguf_writer.add_head_count(n_head)
-        self.gguf_writer.add_head_count_kv(n_head)
+        self.gguf_writer.add_head_count_kv(n_head_kv)
         self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
-        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
+        self.gguf_writer.add_rope_dimension_count(rope_dims)
+        self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
         self.gguf_writer.add_file_type(self.ftype)
 
+        # write rope scaling for long context (128k) model
+        rope_scaling = self.find_hparam(['rope_scaling'], True)
+        if (rope_scaling is None):
+            return
+
+        scale = max_pos_embds / orig_max_pos_embds
+
+        rope_scaling_type = rope_scaling.get('type', '').lower()
+        if len(rope_scaling_type) == 0:
+            raise KeyError('Missing the required key rope_scaling.type')
+
+        if rope_scaling_type == 'su':
+            attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0
+        elif rope_scaling_type == 'yarn':
+            attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0
+        else:
+            raise NotImplementedError(f'The rope scaling type {rope_scaling_type} is not supported yet')
+
+        self.gguf_writer.add_rope_scaling_attn_factors(attn_factor)
+
+        long_factors = rope_scaling.get('long_factor', None)
+        short_factors = rope_scaling.get('short_factor', None)
+
+        if long_factors is None or short_factors is None:
+            raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
+
+        if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
+            raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+
+        self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32))
+        self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
+
 
 @Model.register("PlamoForCausalLM")
 class PlamoModel(Model):
```
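For orientation, here is a small standalone sketch of the attention-factor math the converter now writes into the GGUF header. The hyperparameters are assumed values for Phi-3-mini-128k (hidden size 3072, 32 attention heads, 4096 trained vs. 131072 extended positions) and are not taken from this diff; only the formulas come from the code above.

```python
import math

# Assumed Phi-3-mini-128k hyperparameters (illustrative, not read from a config file).
n_embd = 3072
n_head = 32
max_pos_embds = 131072        # extended (128K) context length
orig_max_pos_embds = 4096     # original trained context length

rope_dims = n_embd // n_head                # 96 -> long_factor/short_factor need 48 entries each
scale = max_pos_embds / orig_max_pos_embds  # 32.0

# 'su' (LongRoPE-style) attention factor, as computed in the converter above.
attn_factor_su = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0
# 'yarn' attention factor, the other branch supported above.
attn_factor_yarn = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0

print(rope_dims // 2)               # 48
print(round(attn_factor_su, 4))     # ~1.1902
print(round(attn_factor_yarn, 4))   # ~1.3466
```

Both factor arrays end up as plain tensors in the GGUF file; per the commit message, the runtime picks the short factors whenever the requested context size stays below the trained context size.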

examples/finetune/finetune.cpp

Lines changed: 2 additions & 2 deletions
```diff
@@ -563,8 +563,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
         // not capturing these, to silcence warnings
         const int rope_mode = 0;
 
-        return ggml_rope_custom(ctx,
-            t, KQ_pos, n_rot, rope_mode, n_ctx, 0,
+        return ggml_rope_ext(ctx,
+            t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0,
             rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
         );
     };
```

examples/train-text-from-scratch/train-text-from-scratch.cpp

Lines changed: 2 additions & 2 deletions
```diff
@@ -301,8 +301,8 @@ static struct ggml_tensor * llama_build_train_graphs(
         // not capturing these, to silcence warnings
         const int rope_mode = 0;
 
-        return ggml_rope_custom(
-            ctx, t, KQ_pos, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
+        return ggml_rope_ext(
+            ctx, t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
         );
     };
```
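Both training examples above change only at the call site: `ggml_rope_custom` becomes `ggml_rope_ext`, which takes one additional tensor argument for per-dimension frequency factors, and passing `nullptr` keeps the previous behaviour. Below is a rough Python analogue of that calling convention, not the actual ggml API: the function name and NeoX-style pair layout are invented for illustration.

```python
from typing import Optional, Sequence

def rope_ext_angles(pos: int, n_dims: int,
                    freq_factors: Optional[Sequence[float]] = None,
                    freq_base: float = 10000.0,
                    freq_scale: float = 1.0) -> list:
    """Toy analogue of the extended rope call: identical to the old variant
    except for the extra, optional freq_factors argument. None plays the role
    of the nullptr passed by the two examples above."""
    theta_scale = freq_base ** (-2.0 / n_dims)
    angles = []
    for i in range(n_dims // 2):
        factor = freq_factors[i] if freq_factors is not None else 1.0
        angles.append(pos * freq_scale * theta_scale ** i / factor)
    return angles

# Passing no factors reproduces the old behaviour exactly.
assert rope_ext_angles(7, 8) == rope_ext_angles(7, 8, freq_factors=[1.0] * 4)
```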

ggml-cuda/rope.cu

Lines changed: 48 additions & 24 deletions
```diff
@@ -58,10 +58,10 @@ static __global__ void rope(
     dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }
 
-template<typename T, bool has_pos>
+template<typename T, bool has_pos, bool has_freq_facs>
 static __global__ void rope_neox(
     const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims, const float * freq_factors
 ) {
     const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
 
@@ -88,7 +88,9 @@ static __global__ void rope_neox(
     float cur_rot = inv_ndims * ic - ib;
 
     const int p = has_pos ? pos[i2] : 0;
-    const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);
+    const float freq_factor = has_freq_facs ? freq_factors[ic/2] : 1.0f;
+
+    const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f)/freq_factor;
 
     float cos_theta, sin_theta;
     rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
@@ -164,7 +166,7 @@ static void rope_cuda(
 template<typename T>
 static void rope_neox_cuda(
     const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream
 ) {
     GGML_ASSERT(ncols % 2 == 0);
     const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
@@ -175,15 +177,29 @@ static void rope_neox_cuda(
     const float inv_ndims = -1.0f / n_dims;
 
     if (pos == nullptr) {
-        rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
-            theta_scale, inv_ndims
-        );
+        if (freq_factors == nullptr) {
+            rope_neox<T, false, false><<<block_nums, block_dims, 0, stream>>>(
+                x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+                theta_scale, inv_ndims, freq_factors
+            );
+        } else {
+            rope_neox<T, false, true><<<block_nums, block_dims, 0, stream>>>(
+                x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+                theta_scale, inv_ndims, freq_factors
+            );
+        }
     } else {
-        rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
-            theta_scale, inv_ndims
-        );
+        if (freq_factors == nullptr) {
+            rope_neox<T, true, false><<<block_nums, block_dims, 0, stream>>>(
+                x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+                theta_scale, inv_ndims, freq_factors
+            );
+        } else {
+            rope_neox<T, true, true><<<block_nums, block_dims, 0, stream>>>(
+                x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+                theta_scale, inv_ndims, freq_factors
+            );
+        }
     }
 }
 
@@ -214,24 +230,27 @@ static void rope_cuda_f32(
 
 static void rope_neox_cuda_f16(
     const half * x, half * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream) {
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
 
-    rope_neox_cuda<half>(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, stream);
+    rope_neox_cuda<half>(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
 }
 
 static void rope_neox_cuda_f32(
     const float * x, float * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream
 ) {
 
-    rope_neox_cuda<float>(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, stream);
+    rope_neox_cuda<float>(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
 }
 
 void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     const ggml_tensor * src1 = dst->src[1];
+    const ggml_tensor * src2 = dst->src[2];
+
     const float * src0_d = (const float *)src0->data;
     const float * src1_d = (const float *)src1->data;
+
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();
 
@@ -241,7 +260,6 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
-    const int64_t ne2 = dst->ne[2];
     const int64_t nrows = ggml_nrows(src0);
 
     //const int n_past = ((int32_t *) dst->op_params)[0];
@@ -259,16 +277,22 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
     memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
 
+    const float * freq_factors = nullptr;
     const int32_t * pos = nullptr;
-    if ((mode & 1) == 0) {
-        GGML_ASSERT(src1->type == GGML_TYPE_I32);
-        GGML_ASSERT(src1->ne[0] == ne2);
-        pos = (const int32_t *) src1_d;
-    }
 
     const bool is_neox = mode & 2;
     const bool is_glm = mode & 4;
 
+    if (is_neox) {
+        pos = (const int32_t *) src1_d;
+
+        if (src2 != nullptr) {
+            freq_factors = (const float *) src2->data;
+        }
+    } else {
+        GGML_ASSERT(src2 == nullptr && "TODO: freq_factors not implemented for !is_neox");
+    }
+
    rope_corr_dims corr_dims;
    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
 
@@ -280,12 +304,12 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
         if (src0->type == GGML_TYPE_F32) {
             rope_neox_cuda_f32(
                 (const float *)src0_d, (float *)dst_d, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, stream
+                attn_factor, corr_dims, freq_factors, stream
             );
         } else if (src0->type == GGML_TYPE_F16) {
             rope_neox_cuda_f16(
                 (const half *)src0_d, (half *)dst_d, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, stream
+                attn_factor, corr_dims, freq_factors, stream
             );
         } else {
             GGML_ASSERT(false);
```
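To make the kernel change concrete: the only numerical difference is that each dimension pair's base angle is divided by its frequency factor, so factors greater than one slow that pair's rotation and stretch its usable position range. The sketch below mirrors the updated line `theta_base = p*freq_scale*powf(theta_scale, col/2.0f)/freq_factor`; the factor values are invented for illustration, since real models ship their own long_factor/short_factor arrays.

```python
def pair_angle(pos: int, i: int, n_dims: int, freq_base: float = 10000.0,
               freq_scale: float = 1.0, freq_factor: float = 1.0) -> float:
    # theta_scale = freq_base^(-2/n_dims); pair i uses theta_scale^i,
    # divided by its frequency factor (1.0 when no factors are provided).
    theta_scale = freq_base ** (-2.0 / n_dims)
    return pos * freq_scale * theta_scale ** i / freq_factor

n_dims = 96                    # assumed rope dimension count
pos = 100_000                  # a position far beyond a 4K training window
for i, factor in [(0, 1.0), (24, 4.0), (47, 32.0)]:   # invented factors
    plain  = pair_angle(pos, i, n_dims)
    scaled = pair_angle(pos, i, n_dims, freq_factor=factor)
    print(f"pair {i:2d}: {plain:11.4f} rad -> {scaled:11.4f} rad  (factor {factor})")
```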

ggml-kompute.cpp

Lines changed: 4 additions & 0 deletions
```diff
@@ -1677,6 +1677,10 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                 } break;
             case GGML_OP_ROPE:
                 {
+#pragma message("TODO: implement phi3 frequency factors support")
+#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
+                    GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
+
                     GGML_ASSERT(ne10 == ne02);
                     GGML_ASSERT(src0t == dstt);
                     // const int n_past = ((int32_t *) dst->op_params)[0];
```

0 commit comments
