Modified RoPE with linear scaling #2019

Status: Closed · wants to merge 2 commits

3 changes: 3 additions & 0 deletions CMakeLists.txt
@@ -72,6 +72,7 @@ set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kern
set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
set(LLAMA_TRAINIG_CTX "2176" CACHE STRING "llama: model training maximum context")
Review comment (Collaborator):

Should this be LLAMA_TRAINING_CTX?

option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_METAL "llama: use Metal" OFF)
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
@@ -125,6 +126,8 @@ set(CMAKE_C_STANDARD_REQUIRED true)
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)

add_compile_definitions(GGML_TRAINING_CTX=${LLAMA_TRAINIG_CTX})

if (NOT MSVC)
if (LLAMA_SANITIZE_THREAD)
add_compile_options(-fsanitize=thread)
5 changes: 5 additions & 0 deletions Makefile
@@ -130,6 +130,11 @@ ifneq ($(filter ppc64%,$(UNAME_M)),)
endif
endif

ifdef LLAMA_TRAINIG_CTX
CFLAGS += -DGGML_TRAINING_CTX=$(LLAMA_TRAINIG_CTX)
CXXFLAGS += -DGGML_TRAINING_CTX=$(LLAMA_TRAINIG_CTX)
endif

ifndef LLAMA_NO_K_QUANTS
CFLAGS += -DGGML_USE_K_QUANTS
CXXFLAGS += -DGGML_USE_K_QUANTS
5 changes: 4 additions & 1 deletion ggml-cuda.cu
@@ -2175,10 +2175,13 @@ inline void ggml_cuda_op_rope(
const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2];
const int n_ctx = ((int32_t *) src1->data)[3];
GGML_ASSERT(mode == 0);

const float theta_scale = powf(10000.0, -2.0f/n_dims);
const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
const float p0 = ((mode & 1) == 0 ? n_past + i02 : i02);

const float p = n_ctx <= GGML_TRAINING_CTX ? p0 : p0 * GGML_TRAINING_CTX / n_ctx;

// compute
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
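For reference, a minimal standalone sketch of the position scaling used in the CUDA hunk above, assuming the default GGML_TRAINING_CTX of 2176 (`scale_position` is a hypothetical helper name, not part of the patch):

```c
#include <stdio.h>

#ifndef GGML_TRAINING_CTX
#define GGML_TRAINING_CTX 2176
#endif

// Linear scaling as in ggml_cuda_op_rope: positions are compressed only when
// the requested context exceeds the training context.
static float scale_position(float p0, int n_ctx) {
    return n_ctx <= GGML_TRAINING_CTX ? p0 : p0 * GGML_TRAINING_CTX / n_ctx;
}

int main(void) {
    printf("%.2f\n", scale_position(1000.0f, 2048)); // 1000.00 (no scaling, within the training window)
    printf("%.2f\n", scale_position(4095.0f, 4096)); // 2175.47 (compressed back into the 2176 window)
    return 0;
}
```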
14 changes: 14 additions & 0 deletions ggml.c
@@ -12535,6 +12535,9 @@ static void ggml_compute_forward_rope_f32(
dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
}
} else if (!is_neox) {
if (n_ctx > GGML_TRAINING_CTX) {
theta = theta * GGML_TRAINING_CTX / n_ctx;
}
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
const float cos_theta = cosf(theta);
const float sin_theta = sinf(theta);
@@ -12675,6 +12678,9 @@ static void ggml_compute_forward_rope_f16(
dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
}
} if (!is_neox) {
if (n_ctx > GGML_TRAINING_CTX) {
theta = theta * GGML_TRAINING_CTX / n_ctx;
}
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
const float cos_theta = cosf(theta);
const float sin_theta = sinf(theta);
@@ -12760,6 +12766,7 @@ static void ggml_compute_forward_rope_back_f32(
const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2];
const int n_ctx = ((int32_t *) src1->data)[3];

assert(n_past >= 0);

@@ -12813,6 +12820,9 @@ static void ggml_compute_forward_rope_back_f32(
float theta = (float)p;

if (!is_neox) {
if (n_ctx > GGML_TRAINING_CTX) {
theta = theta * GGML_TRAINING_CTX / n_ctx;
}
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
const float cos_theta = cosf(theta);
const float sin_theta = sinf(theta);
@@ -12873,6 +12883,7 @@ static void ggml_compute_forward_rope_back_f16(
const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2];
const int n_ctx = ((int32_t *) src1->data)[3];

assert(n_past >= 0);

@@ -12926,6 +12937,9 @@ static void ggml_compute_forward_rope_back_f16(
float theta = (float)p;

if (!is_neox) {
if (n_ctx > GGML_TRAINING_CTX) {
theta = theta * GGML_TRAINING_CTX / n_ctx;
}
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
const float cos_theta = cosf(theta);
const float sin_theta = sinf(theta);
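Note that the CPU paths above fold the scale into theta rather than into the position itself; since each per-dimension angle is the initial theta (which starts at the position p) multiplied by a power of theta_scale, scaling theta once before the loop is equivalent to scaling p. A small check, using only the formulas visible in the diff:

```c
#include <math.h>
#include <stdio.h>

int main(void) {
    const int   n_dims      = 128;                           // example rotary dimension count
    const float theta_scale = powf(10000.0f, -2.0f / n_dims);
    const float p           = 3000.0f;                       // example position past the training window
    const float scale       = 2176.0f / 4096.0f;             // GGML_TRAINING_CTX / n_ctx

    float theta_scaled   = p * scale;                        // scaled up front, as in the patch
    float theta_unscaled = p;
    for (int k = 0; k < 4; ++k) {
        // The two columns agree (up to rounding): (p*scale)*s^k == (p*s^k)*scale.
        printf("%g  %g\n", theta_scaled, theta_unscaled * scale);
        theta_scaled   *= theta_scale;
        theta_unscaled *= theta_scale;
    }
    return 0;
}
```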
6 changes: 6 additions & 0 deletions ggml.h
@@ -201,6 +201,12 @@
#define GGML_MAX_NAME 48
#define GGML_DEFAULT_N_THREADS 4

// Maximum training context of the model in use
// For the LLaMA models this is normally 2048, but somehow "stepping out" by 128 gives better results (tested at 7B and 13B)
#ifndef GGML_TRAINING_CTX
#define GGML_TRAINING_CTX 2176
#endif

#define GGML_ASSERT(x) \
do { \
if (!(x)) { \
4 changes: 2 additions & 2 deletions llama.cpp
@@ -1491,11 +1491,11 @@ static bool llama_eval_internal(
offload_func_kq(tmpq);
ggml_set_name(tmpq, "tmpq");

struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx);
offload_func_kq(Kcur);
ggml_set_name(Kcur, "Kcur");

struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx);
offload_func_kq(Qcur);
ggml_set_name(Qcur, "Qcur");

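On the caller side the only change is forwarding the context size into ggml_rope_inplace, as shown above. A hypothetical usage sketch (llama_context_default_params and the n_ctx field are assumed from the llama.h of this era, not introduced by this patch):

```c
#include "llama.h"

int main(void) {
    // Requesting a context larger than GGML_TRAINING_CTX (2176 by default)
    // makes the RoPE kernels compress positions by GGML_TRAINING_CTX / n_ctx.
    struct llama_context_params params = llama_context_default_params();
    params.n_ctx = 4096;  // positions scaled by 2176/4096 ~ 0.53 inside the rope ops
    // ... create the context, load the model, and evaluate as usual ...
    return 0;
}
```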