
Commit ceca1ae

[SYCL] fix error when set main gpu to non-zero (#5901)
* fix error when set main gpu to non-zero
* fix delete condition
1 parent e04e04f commit ceca1ae
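
Editor's note: the caller-side configuration that hits this code path is single-device split mode with a non-zero main GPU. A minimal illustrative sketch using the public llama.cpp API (not part of this diff; the helper name is made up):

// Illustrative only: request "no split" mode on a non-zero SYCL GPU,
// the combination this commit fixes.
#include "llama.h"

llama_model_params make_single_gpu_params() {
    llama_model_params mparams = llama_model_default_params();
    mparams.split_mode = LLAMA_SPLIT_MODE_NONE; // keep the whole model on one device
    mparams.main_gpu   = 1;                     // a non-zero device id previously triggered the error
    return mparams;
}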


3 files changed: +107 -64 lines changed


ggml-sycl.cpp

Lines changed: 94 additions & 60 deletions
@@ -3559,12 +3559,31 @@ class sycl_gpu_mgr {
     int work_group_size = 0;
     std::string gpus_list = "";
 
+    /*
+    Use all GPU with same top max compute units
+    */
     sycl_gpu_mgr() {
         detect_sycl_gpu_list_with_max_cu();
         get_allow_gpus();
         create_context_with_gpus();
     }
 
+    /*
+    Use the assigned GPU as only one
+    */
+    sycl_gpu_mgr(int main_gpu_id) {
+        sycl::device device = dpct::dev_mgr::instance().get_device(main_gpu_id);
+        dpct::device_info prop;
+        dpct::get_device_info(prop, device);
+        gpus.push_back(main_gpu_id);
+        devices.push_back(device);
+        work_group_size = prop.get_max_work_group_size();
+        max_compute_units = prop.get_max_compute_units();
+
+        get_allow_gpus();
+        create_context_with_gpus();
+    }
+
     void create_context_with_gpus() {
         sycl::context ctx = sycl::context(devices);
         assert(gpus.size() > 0);
@@ -3580,7 +3599,7 @@ class sycl_gpu_mgr {
             gpus_list += std::to_string(gpus[i]);
             gpus_list += ",";
         }
-        if (gpus_list.length() > 2) {
+        if (gpus_list.length() > 1) {
            gpus_list.pop_back();
         }
     }
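
Editor's note on the changed condition: for a single GPU the list is e.g. "0,", whose length is 2, so the old "> 2" guard skipped pop_back() and left a trailing comma; "> 1" strips the comma for any non-empty list. A standalone sketch of the same logic (not the actual get_allow_gpus() body):

// Build a comma-separated GPU id list and strip the trailing comma,
// mirroring the fixed condition above.
#include <string>
#include <vector>

std::string make_gpus_list(const std::vector<int> & gpus) {
    std::string gpus_list;
    for (int id : gpus) {
        gpus_list += std::to_string(id);
        gpus_list += ",";
    }
    if (gpus_list.length() > 1) { // "0," has length 2, so the comma is now stripped
        gpus_list.pop_back();
    }
    return gpus_list;             // {0} -> "0", {0, 2} -> "0,2"
}
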
@@ -3629,8 +3648,8 @@ class sycl_gpu_mgr {
             if (gpus[i] == id)
                 return i;
         }
-        assert(false);
-        return -1;
+        printf("miss to get device index by id=%d\n", id);
+        GGML_ASSERT(false);
     }
 
     int get_next_index(int id) {
@@ -3639,8 +3658,7 @@ class sycl_gpu_mgr {
             if (gpus[i] == id)
                 return i;
         }
-        assert(false);
-        return -1;
+        GGML_ASSERT(false);
     }
 };
 
@@ -3649,6 +3667,7 @@ static int g_device_count = -1;
 static int g_all_sycl_device_count = -1;
 static int g_main_device = -1;
 static int g_main_device_id = -1;
+static bool g_ggml_backend_sycl_buffer_type_initialized = false;
 
 static std::array<float, GGML_SYCL_MAX_DEVICES> g_default_tensor_split = {};
 
@@ -13225,7 +13244,7 @@ void ggml_backend_sycl_print_sycl_devices() {
 }
 
 void print_gpu_device_list() {
-    fprintf(stderr, "detect %d SYCL GPUs: [%s] with Max compute units:%d\n",
+    fprintf(stderr, "detect %d SYCL GPUs: [%s] with top Max compute units:%d\n",
             g_sycl_gpu_mgr->get_gpu_count(),
             g_sycl_gpu_mgr->gpus_list.c_str(),
             g_sycl_gpu_mgr->max_compute_units);
@@ -13264,6 +13283,15 @@ void ggml_init_sycl() try {
 #else
         fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
 #endif
+
+/* NOT REMOVE, keep it for next optimize for XMX.
+#if defined(SYCL_USE_XMX)
+        fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
+#else
+        fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
+#endif
+*/
+
         if (CHECK_TRY_ERROR(g_all_sycl_device_count =
                             dpct::dev_mgr::instance().device_count()) != 0) {
             initialized = true;
@@ -13272,68 +13300,61 @@ void ggml_init_sycl() try {
         }
         GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
         ggml_backend_sycl_print_sycl_devices();
-
         if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
+        print_gpu_device_list();
+        initialized = true;
+        g_sycl_loaded = true;
+    }
 
-        g_device_count = g_sycl_gpu_mgr->get_gpu_count();
-        g_work_group_size = g_sycl_gpu_mgr->work_group_size;
 
-        print_gpu_device_list();
 
-        int64_t total_vram = 0;
+    g_device_count = g_sycl_gpu_mgr->get_gpu_count();
+    g_work_group_size = g_sycl_gpu_mgr->work_group_size;
 
-/* NOT REMOVE, keep it for next optimize for XMX.
-#if defined(SYCL_USE_XMX)
-        fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
-#else
-        fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
-#endif
-*/
-        for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
-            g_device_caps[id].vmm = 0;
-            g_device_caps[id].device_id = -1;
-            g_device_caps[id].cc = 0;
-            g_tensor_split[id] = 0;
-            g_default_tensor_split[id] = 0;
-        }
+    int64_t total_vram = 0;
 
-        for (int i = 0; i < g_device_count; ++i) {
-            int device_id = g_sycl_gpu_mgr->gpus[i];
-            g_device_caps[i].vmm = 0;
 
-            dpct::device_info prop;
-            SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
-                prop, dpct::dev_mgr::instance().get_device(device_id))));
+    for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
+        g_device_caps[id].vmm = 0;
+        g_device_caps[id].device_id = -1;
+        g_device_caps[id].cc = 0;
+        g_tensor_split[id] = 0;
+        g_default_tensor_split[id] = 0;
+    }
 
-            g_default_tensor_split[i] = total_vram;
-            total_vram += prop.get_global_mem_size();
+    for (int i = 0; i < g_device_count; ++i) {
+        int device_id = g_sycl_gpu_mgr->gpus[i];
+        g_device_caps[i].vmm = 0;
 
-            g_device_caps[i].cc =
-                100 * prop.get_major_version() + 10 * prop.get_minor_version();
-        }
+        dpct::device_info prop;
+        SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
+            prop, dpct::dev_mgr::instance().get_device(device_id))));
 
-        for (int i = 0; i < g_device_count; ++i) {
-            g_default_tensor_split[i] /= total_vram;
-        }
+        g_default_tensor_split[i] = total_vram;
+        total_vram += prop.get_global_mem_size();
 
-        for (int i = 0; i < g_device_count; ++i) {
-            SYCL_CHECK(ggml_sycl_set_device(i));
+        g_device_caps[i].cc =
+            100 * prop.get_major_version() + 10 * prop.get_minor_version();
+    }
 
-            // create sycl streams
-            for (int is = 0; is < MAX_STREAMS; ++is) {
-                SYCL_CHECK(CHECK_TRY_ERROR(
-                    g_syclStreams[i][is] =
-                        dpct::get_current_device().create_queue(
-                            g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
-            }
+    for (int i = 0; i < g_device_count; ++i) {
+        g_default_tensor_split[i] /= total_vram;
+    }
+
+    for (int i = 0; i < g_device_count; ++i) {
+        SYCL_CHECK(ggml_sycl_set_device(i));
 
-            const dpct::queue_ptr stream = g_syclStreams[i][0];
-            // create sycl handle
-            SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
+        // create sycl streams
+        for (int is = 0; is < MAX_STREAMS; ++is) {
+            SYCL_CHECK(CHECK_TRY_ERROR(
+                g_syclStreams[i][is] =
+                    dpct::get_current_device().create_queue(
+                        g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
         }
 
-        initialized = true;
-        g_sycl_loaded = true;
+        const dpct::queue_ptr stream = g_syclStreams[i][0];
+        // create sycl handle
+        SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
     }
 }
 catch (sycl::exception const &exc) {
@@ -16732,22 +16753,24 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
     /* .is_host = */ nullptr,
 };
 
-ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
+ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
+    if (device_index>=g_device_count or device_index<0) {
+        printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
+            device_index, g_device_count-1);
+        GGML_ASSERT(device_index<g_device_count);
+    }
     static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
 
-    static bool ggml_backend_sycl_buffer_type_initialized = false;
-
-    if (!ggml_backend_sycl_buffer_type_initialized) {
+    if (!g_ggml_backend_sycl_buffer_type_initialized) {
         for (int i = 0; i < g_device_count; i++) {
             ggml_backend_sycl_buffer_types[i] = {
                 /* .iface = */ ggml_backend_sycl_buffer_type_interface,
                 /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(g_sycl_gpu_mgr->gpus[i])},
             };
         }
-        ggml_backend_sycl_buffer_type_initialized = true;
+        g_ggml_backend_sycl_buffer_type_initialized = true;
     }
-
-    return &ggml_backend_sycl_buffer_types[device];
+    return &ggml_backend_sycl_buffer_types[device_index];
 }
 
 // sycl split buffer type
@@ -17496,6 +17519,17 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
     return g_sycl_gpu_mgr->get_index(device_id);
 }
 
+GGML_API GGML_CALL void ggml_backend_sycl_set_single_device(int main_gpu_id) {
+    GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
+    printf("ggml_backend_sycl_set_single_device: use single device: %d\n", main_gpu_id);
+    if (g_sycl_gpu_mgr) {
+        delete g_sycl_gpu_mgr;
+    }
+    g_sycl_gpu_mgr = new sycl_gpu_mgr(main_gpu_id);
+    ggml_init_sycl();
+    g_ggml_backend_sycl_buffer_type_initialized = false;
+}
+
 extern "C" int ggml_backend_sycl_reg_devices();
 
 int ggml_backend_sycl_reg_devices() {
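
Editor's note: taken together, the new entry point and the existing id-to-index mapping give callers a two-step pattern. A minimal sketch assuming the public ggml-sycl.h API added in this diff (the wrapper function itself is hypothetical):

// Pin the SYCL backend to one, possibly non-zero, GPU and fetch its buffer type.
#include "ggml-backend.h"
#include "ggml-sycl.h"

ggml_backend_buffer_type_t pick_single_sycl_gpu(int main_gpu_id) {
    // Rebuild the GPU manager around this device id and re-run ggml_init_sycl().
    ggml_backend_sycl_set_single_device(main_gpu_id);
    // Map the device id to the 0-based index the backend APIs expect (0 here).
    int idx = ggml_backend_sycl_get_device_index(main_gpu_id);
    // The range check in ggml_backend_sycl_buffer_type() now accepts this index.
    return ggml_backend_sycl_buffer_type(idx);
}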

ggml-sycl.h

Lines changed: 1 addition & 0 deletions
@@ -28,6 +28,7 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
 GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
 GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
 GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
+GGML_API GGML_CALL void ggml_backend_sycl_set_single_device(int main_gpu);
 
 #ifdef __cplusplus
 }

llama.cpp

Lines changed: 12 additions & 4 deletions
@@ -3750,6 +3750,14 @@ static bool llm_load_tensors(
     model.main_gpu = main_gpu;
     model.n_gpu_layers = n_gpu_layers;
 
+#ifdef GGML_USE_SYCL
+    if (split_mode == LLAMA_SPLIT_MODE_NONE) {
+        ggml_backend_sycl_set_single_device(main_gpu);
+        //SYCL use device index (0, 1, 2), instead if device id.
+        main_gpu = ggml_backend_sycl_get_device_index(main_gpu);
+    }
+#endif
+
     const int64_t n_layer = hparams.n_layer;
     const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
 
@@ -12260,13 +12268,13 @@ struct llama_context * llama_new_context_with_model(
                 ctx->backends.push_back(backend);
             } else {
                 // LLAMA_SPLIT_LAYER requires a backend for each GPU
-                int id_list[GGML_SYCL_MAX_DEVICES];
-                ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
+
                 for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
-                    int device_id = id_list[i];
                     ggml_backend_t backend = ggml_backend_sycl_init(i);
                     if (backend == nullptr) {
-                        LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i);
+                        int id_list[GGML_SYCL_MAX_DEVICES];
+                        ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
+                        LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, id_list[i], i);
                         llama_free(ctx);
                         return nullptr;
                     }
