Skip to content

cann: add Ascend NPU support #2336

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions ggml/src/ggml-cann/Doxyfile
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ DOXYFILE_ENCODING = UTF-8
# title of most generated pages and in a few other places.
# The default value is: My Project.

PROJECT_NAME = "llama.cpp"
PROJECT_NAME = "whisper.cpp"

# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
# could be handy for archiving the generated documentation or if some version
Expand All @@ -44,7 +44,7 @@ PROJECT_NUMBER =
# for a project that appears at the top of each page and should give viewer a
# quick idea about the purpose of the project. Keep the description short.

PROJECT_BRIEF = "llama inference engine"
PROJECT_BRIEF = "Port of OpenAI's Whisper model in C/C++"

# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
# in the documentation. The maximum height of the logo should not exceed 55
Expand Down
22 changes: 20 additions & 2 deletions src/whisper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@
#include "openvino/whisper-openvino-encoder.h"
#endif

#ifdef GGML_USE_CANN
#include "ggml-cann.h"
#endif

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
Expand Down Expand Up @@ -1283,6 +1287,16 @@ static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & pa
}
#endif

#ifdef GGML_USE_CANN
if (params.use_gpu) {
WHISPER_LOG_INFO("%s: using CANN backend\n", __func__);
result = ggml_backend_cann_init(params.gpu_device);
if (!result) {
WHISPER_LOG_ERROR("%s: ggml_backend_cann_init() failed\n", __func__);
}
}
#endif

return result;
}

Expand Down Expand Up @@ -1335,6 +1349,10 @@ static ggml_backend_buffer_type_t whisper_default_buffer_type(const whisper_cont
result || (result = ggml_backend_vk_buffer_type(params.gpu_device));
#endif

#ifdef GGML_USE_CANN
result || (result == ggml_backend_cann_buffer_type(params.gpu_device));
#endif

result || (result = ggml_backend_cpu_buffer_type());

return result;
Expand Down Expand Up @@ -4337,8 +4355,8 @@ const char * whisper_print_system_info(void) {
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
s += "CUDA = " + std::to_string(ggml_cpu_has_cuda()) + " | ";
s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
s += "OPENVINO = " + std::to_string(whisper_has_openvino()) ;

s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | ";
s += "CANN = " + std::to_string(ggml_cpu_has_cann()) ;
return s.c_str();
}

Expand Down
123 changes: 106 additions & 17 deletions tests/test-backend-ops.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#include <ggml.h>
#include <ggml-alloc.h>
#include <ggml-backend.h>
#include <ggml-backend-impl.h>

#include <algorithm>
#include <array>
Expand Down Expand Up @@ -80,14 +79,22 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
im = nullptr;
}
}

ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im);
GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size()));
// TODO: other cases
//#pragma omp parallel for
//for (int i = 0; i < tensor->ne[1]; i++) {
// ggml_quantize_chunk(tensor->type, data.data(), dataq.data(),
// i * tensor->ne[0], 1, tensor->ne[0], im);
//}

ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
} else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
// This is going to create some weird integers though.
ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}

Expand Down Expand Up @@ -125,7 +132,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
tt.to_float(&buf[i], vq.data(), bs);
tv.insert(tv.end(), vq.begin(), vq.end());
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}
}
Expand Down Expand Up @@ -760,7 +767,7 @@ struct test_dup : public test_case {
}

test_dup(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {10, 10, 10, 1},
std::array<int64_t, 4> ne = {10, 10, 20, 1},
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Make sure thers's no existing scenes are missed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This modification causes ne and nb to change after permute, thus covering test_dup of non-continuous data. Keeping same with https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L770

std::array<int64_t, 4> permute = {0, 0, 0, 0})
: type(type), ne(ne), permute(permute),
_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
Expand All @@ -780,9 +787,11 @@ struct test_cpy : public test_case {
const ggml_type type_src;
const ggml_type type_dst;
const std::array<int64_t, 4> ne;
const std::array<int64_t, 4> permute;
bool _src_use_permute;

std::string vars() override {
return VARS_TO_STR3(type_src, type_dst, ne);
return VARS_TO_STR4(type_src, type_dst, ne, permute);
}

double max_nmse_err() override {
Expand All @@ -794,12 +803,17 @@ struct test_cpy : public test_case {
}

test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {10, 10, 10, 1})
: type_src(type_src), type_dst(type_dst), ne(ne) {}
std::array<int64_t, 4> ne = {10, 10, 10, 1},
std::array<int64_t, 4> permute = {0, 0, 0, 0})
: type_src(type_src), type_dst(type_dst), ne(ne), permute(permute),
_src_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}

ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, ne.data());
if (_src_use_permute) {
src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
}
ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, src->ne);
ggml_tensor * out = ggml_cpy(ctx, src, dst);
return out;
}
Expand Down Expand Up @@ -1175,6 +1189,7 @@ struct test_soft_max : public test_case {
}
};


// GGML_OP_ROPE
struct test_rope : public test_case {
const ggml_type type;
Expand Down Expand Up @@ -1267,6 +1282,32 @@ struct test_pool2d : public test_case {
}
};

// GGML_OP_CONV_TRANSPOSE_1D
struct test_conv_transpose_1d : public test_case {
const std::array<int64_t, 4> ne_input;
const std::array<int64_t, 4> ne_kernel;

const int s0; // stride
const int p0; // padding
const int d0; // dilation

std::string vars() override {
return VARS_TO_STR5(ne_input, ne_kernel, s0, p0, d0);
}

test_conv_transpose_1d(std::array<int64_t, 4> ne_input = {197, 32, 1, 1}, // [input_width, input_height, input_channels, 1]
std::array<int64_t, 4> ne_kernel = {16, 32, 32, 1}, // [kernel_width, kernel_height, input_channels, 1]
int s0 = 1, int p0 = 0, int d0 = 1)
: ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), p0(p0), d0(d0) {}

ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data());
ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data());
ggml_tensor * out = ggml_conv_transpose_1d(ctx, kernel, input, s0, p0, d0);
return out;
}
};

// GGML_OP_IM2COL
struct test_im2col : public test_case {
const ggml_type type_input;
Expand All @@ -1280,7 +1321,7 @@ struct test_im2col : public test_case {
// padding
const int p0;
const int p1;
// dilatation
// dilation
const int d0;
const int d1;
// mode
Expand Down Expand Up @@ -1393,7 +1434,7 @@ struct test_argsort : public test_case {
ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
}
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}
}
Expand Down Expand Up @@ -1470,19 +1511,21 @@ struct test_group_norm : public test_case {
const ggml_type type;
const std::array<int64_t, 4> ne;
const int32_t num_groups;
const float eps;

std::string vars() override {
return VARS_TO_STR3(type, ne, num_groups);
}

test_group_norm(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {64, 64, 320, 1},
int32_t num_groups = 32)
: type(type), ne(ne), num_groups(num_groups) {}
int32_t num_groups = 32,
float eps = 1e-6f)
: type(type), ne(ne), num_groups(num_groups), eps(eps) {}

ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
ggml_tensor * out = ggml_group_norm(ctx, a, num_groups);
ggml_tensor * out = ggml_group_norm(ctx, a, num_groups, eps);
return out;
}
};
Expand Down Expand Up @@ -2053,6 +2096,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
GGML_TYPE_BF16,
};

// unary ops
Expand Down Expand Up @@ -2097,6 +2141,19 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op

test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32));
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16));
// test cases for 1D im2col
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));

test_cases.emplace_back(new test_conv_transpose_1d());
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 3, 0, 1));
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 2, 0, 1));
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 1, 0, 1));
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 2, 0, 1));
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 1, 0, 1));
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));


test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1}));
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {2, 1, 1, 1}));
Expand All @@ -2110,12 +2167,22 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_dup(GGML_TYPE_F16));
test_cases.emplace_back(new test_dup(GGML_TYPE_I32));
test_cases.emplace_back(new test_dup(GGML_TYPE_I16));
test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {0, 2, 1, 3}));
test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {0, 2, 1, 3})); // dup by rows
test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {1, 0, 2, 3}));
test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {1, 0, 2, 3})); // dup dst not-contiguous
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));

for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
for (ggml_type type_dst : all_types) {
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
}
}
for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_F32}) {
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {1, 0, 2, 3})); // cpy not-contiguous
}
}

Expand Down Expand Up @@ -2165,6 +2232,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
}

#if 1
for (ggml_type type_a : base_types) {
for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
Expand All @@ -2184,10 +2252,31 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2}));
}
}
#else
// m = a rows
// n = b rows
// k = cols
std::uniform_int_distribution<> dist_m(1, 128);
std::uniform_int_distribution<> dist_n(16, 128);
std::uniform_int_distribution<> dist_k(1, 16);
for (int i = 0; i < 1000; i++) {
for (ggml_type type_a : all_types) {
for (ggml_type type_b : {GGML_TYPE_F32}) {
int m = dist_m(rng);
int n = dist_n(rng);
int k = dist_k(rng) * ggml_blck_size(type_a);
test_cases.emplace_back(new test_mul_mat(type_a, type_b, m, n, k, { 1, 1}, {1, 1}));
}
}
}
#endif

for (ggml_type type_a : other_types) {
for (ggml_type type_b : {GGML_TYPE_F32}) {
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
if (ggml_blck_size(type_a) != 256) {
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, ggml_blck_size(type_a), {1, 1}, {1, 1}));
}
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {1, 1}, {1, 1}));
}
}

Expand Down Expand Up @@ -2247,7 +2336,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
for (int n = 0; n < 10; ++n) {
int64_t ne0 = dist_ne0(rng);
int64_t ne1 = dist_ne1(rng);
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 4.0f : 0.0f));
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 4.0f : 0.0f));
}

exponent <<= 1;
Expand All @@ -2266,7 +2355,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
}
}
}

test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, 0.1f, 0.0f));
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 0.0f));
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 0.0f));
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f));
Expand Down Expand Up @@ -2380,7 +2469,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
return true;
}

GGML_ASSERT(false);
GGML_ABORT("fatal error");
return false;
}

Expand Down
Loading