diff --git a/CMakeLists.txt b/CMakeLists.txt index ffda74a700bef..7981c74153426 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,6 +72,7 @@ set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kern set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels") option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF) set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K") +set(LLAMA_TRAINING_CTX "2176" CACHE STRING "llama: model training maximum context") option(LLAMA_CLBLAST "llama: use CLBlast" OFF) option(LLAMA_METAL "llama: use Metal" OFF) option(LLAMA_K_QUANTS "llama: use k-quants" ON) @@ -125,6 +126,8 @@ set(CMAKE_C_STANDARD_REQUIRED true) set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads REQUIRED) +add_compile_definitions(GGML_TRAINING_CTX=${LLAMA_TRAINING_CTX}) + if (NOT MSVC) if (LLAMA_SANITIZE_THREAD) add_compile_options(-fsanitize=thread) diff --git a/Makefile b/Makefile index bda11791d5e23..e50917be48a4b 100644 --- a/Makefile +++ b/Makefile @@ -130,6 +130,11 @@ ifneq ($(filter ppc64%,$(UNAME_M)),) endif endif +ifdef LLAMA_TRAINING_CTX + CFLAGS += -DGGML_TRAINING_CTX=$(LLAMA_TRAINING_CTX) + CXXFLAGS += -DGGML_TRAINING_CTX=$(LLAMA_TRAINING_CTX) +endif + ifndef LLAMA_NO_K_QUANTS CFLAGS += -DGGML_USE_K_QUANTS CXXFLAGS += -DGGML_USE_K_QUANTS diff --git a/ggml-cuda.cu b/ggml-cuda.cu index c34e96abfd08c..2fc3e19bb07a9 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2175,10 +2175,13 @@ inline void ggml_cuda_op_rope( const int n_past = ((int32_t *) src1->data)[0]; const int n_dims = ((int32_t *) src1->data)[1]; const int mode = ((int32_t *) src1->data)[2]; + const int n_ctx = ((int32_t *) src1->data)[3]; GGML_ASSERT(mode == 0); const float theta_scale = powf(10000.0, -2.0f/n_dims); - const float p = ((mode & 1) == 0 ? n_past + i02 : i02); + const float p0 = ((mode & 1) == 0 ? n_past + i02 : i02); + + const float p = n_ctx <= GGML_TRAINING_CTX ? 
p0 : p0 * GGML_TRAINING_CTX / n_ctx; // compute rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main); diff --git a/ggml.c b/ggml.c index 92faf03f746a1..ade982a5af0ab 100644 --- a/ggml.c +++ b/ggml.c @@ -12535,6 +12535,9 @@ static void ggml_compute_forward_rope_f32( dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta; } } else if (!is_neox) { + if (n_ctx > GGML_TRAINING_CTX) { + theta = theta * GGML_TRAINING_CTX / n_ctx; + } for (int64_t i0 = 0; i0 < ne0; i0 += 2) { const float cos_theta = cosf(theta); const float sin_theta = sinf(theta); @@ -12675,6 +12678,9 @@ static void ggml_compute_forward_rope_f16( dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta); } } if (!is_neox) { + if (n_ctx > GGML_TRAINING_CTX) { + theta = theta * GGML_TRAINING_CTX / n_ctx; + } for (int64_t i0 = 0; i0 < ne0; i0 += 2) { const float cos_theta = cosf(theta); const float sin_theta = sinf(theta); @@ -12760,6 +12766,7 @@ static void ggml_compute_forward_rope_back_f32( const int n_past = ((int32_t *) src1->data)[0]; const int n_dims = ((int32_t *) src1->data)[1]; const int mode = ((int32_t *) src1->data)[2]; + const int n_ctx = ((int32_t *) src1->data)[3]; assert(n_past >= 0); @@ -12813,6 +12820,9 @@ static void ggml_compute_forward_rope_back_f32( float theta = (float)p; if (!is_neox) { + if (n_ctx > GGML_TRAINING_CTX) { + theta = theta * GGML_TRAINING_CTX / n_ctx; + } for (int64_t i0 = 0; i0 < ne0; i0 += 2) { const float cos_theta = cosf(theta); const float sin_theta = sinf(theta); @@ -12873,6 +12883,7 @@ static void ggml_compute_forward_rope_back_f16( const int n_past = ((int32_t *) src1->data)[0]; const int n_dims = ((int32_t *) src1->data)[1]; const int mode = ((int32_t *) src1->data)[2]; + const int n_ctx = ((int32_t *) src1->data)[3]; assert(n_past >= 0); @@ -12926,6 +12937,9 @@ static void ggml_compute_forward_rope_back_f16( float theta = (float)p; if (!is_neox) { + if (n_ctx > GGML_TRAINING_CTX) { + theta = 
theta * GGML_TRAINING_CTX / n_ctx; + } for (int64_t i0 = 0; i0 < ne0; i0 += 2) { const float cos_theta = cosf(theta); const float sin_theta = sinf(theta); diff --git a/ggml.h b/ggml.h index 459913222e083..13ca0c9ac8984 100644 --- a/ggml.h +++ b/ggml.h @@ -201,6 +201,12 @@ #define GGML_MAX_NAME 48 #define GGML_DEFAULT_N_THREADS 4 +// Maximum training context of the model in use +// For the LLaMA models this is normally 2048, but empirically, extending it ("stepping out") by 128 gives better results (tested at 7B and 13B) +#ifndef GGML_TRAINING_CTX +#define GGML_TRAINING_CTX 2176 +#endif + #define GGML_ASSERT(x) \ do { \ if (!(x)) { \ diff --git a/llama.cpp b/llama.cpp index 2482bdd18d2e7..b107fc3270666 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1491,11 +1491,11 @@ static bool llama_eval_internal( offload_func_kq(tmpq); ggml_set_name(tmpq, "tmpq"); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx); offload_func_kq(Kcur); ggml_set_name(Kcur, "Kcur"); - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0); + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx); offload_func_kq(Qcur); ggml_set_name(Qcur, "Qcur");