@@ -3559,12 +3559,31 @@ class sycl_gpu_mgr {
     int work_group_size = 0;
     std::string gpus_list = "";
 
+    /*
+    Use all GPUs with the same top max compute units
+    */
     sycl_gpu_mgr() {
         detect_sycl_gpu_list_with_max_cu();
         get_allow_gpus();
         create_context_with_gpus();
     }
 
+    /*
+    Use only the assigned GPU
+    */
+    sycl_gpu_mgr(int main_gpu_id) {
+        sycl::device device = dpct::dev_mgr::instance().get_device(main_gpu_id);
+        dpct::device_info prop;
+        dpct::get_device_info(prop, device);
+        gpus.push_back(main_gpu_id);
+        devices.push_back(device);
+        work_group_size = prop.get_max_work_group_size();
+        max_compute_units = prop.get_max_compute_units();
+
+        get_allow_gpus();
+        create_context_with_gpus();
+    }
+
     void create_context_with_gpus() {
         sycl::context ctx = sycl::context(devices);
         assert(gpus.size() > 0);
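
The new `sycl_gpu_mgr(int main_gpu_id)` constructor skips device detection and pins the manager to one device. A minimal sketch of the same selection pattern in plain SYCL (hypothetical standalone code; the real constructor goes through the `dpct` helper layer shown above):

```cpp
#include <sycl/sycl.hpp>
#include <cstdio>
#include <vector>

int main() {
    // Enumerate GPU devices, then pin a single one by index,
    // mirroring what sycl_gpu_mgr(int main_gpu_id) does via dpct.
    std::vector<sycl::device> gpus =
        sycl::device::get_devices(sycl::info::device_type::gpu);
    int main_gpu_id = 0; // assumed: caller-chosen device index
    if (main_gpu_id >= (int) gpus.size()) return 1;

    sycl::device dev = gpus[main_gpu_id];
    size_t   wg = dev.get_info<sycl::info::device::max_work_group_size>();
    unsigned cu = dev.get_info<sycl::info::device::max_compute_units>();
    printf("device %d: max work-group size %zu, compute units %u\n",
           main_gpu_id, wg, cu);

    // A context built over exactly one device, as create_context_with_gpus() does.
    sycl::context ctx({dev});
    return 0;
}
```
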
@@ -3580,7 +3599,7 @@ class sycl_gpu_mgr {
             gpus_list += std::to_string(gpus[i]);
             gpus_list += ",";
         }
-        if (gpus_list.length() > 2) {
+        if (gpus_list.length() > 1) {
             gpus_list.pop_back();
         }
     }
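
The `> 2` to `> 1` change above fixes trailing-comma trimming for a single GPU: with one single-digit device ID the list is e.g. "0," (length 2), which the old condition left untouched. A small self-contained check of the new behavior (hypothetical helper, mirroring the loop above):

```cpp
#include <cassert>
#include <string>
#include <vector>

// Hypothetical standalone version of the list-building logic above.
static std::string build_gpus_list(const std::vector<int> & gpus) {
    std::string list;
    for (int id : gpus) {
        list += std::to_string(id);
        list += ",";
    }
    if (list.length() > 1) { // new condition: also trims "0," -> "0"
        list.pop_back();
    }
    return list;
}

int main() {
    assert(build_gpus_list({0}) == "0");       // old `> 2` kept the comma here
    assert(build_gpus_list({0, 1, 2}) == "0,1,2");
    return 0;
}
```
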
@@ -3629,8 +3648,8 @@ class sycl_gpu_mgr {
             if (gpus[i] == id)
                 return i;
         }
-        assert(false);
-        return -1;
+        printf("failed to get device index by id=%d\n", id);
+        GGML_ASSERT(false);
     }
 
     int get_next_index(int id) {
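
`get_index()` maps a SYCL device *ID* (a position in the full device list) to the manager's *index* (a position in its filtered `gpus` vector); with the change above a lookup miss now aborts via `GGML_ASSERT` instead of returning -1. A hedged illustration of the mapping, assuming device IDs {2, 3} were selected out of four:

```cpp
#include <cassert>
#include <vector>

int main() {
    // Assumed selection, as in sycl_gpu_mgr::gpus after detection.
    std::vector<int> gpus = {2, 3};
    // get_index(id) walks the vector, as in the diff above:
    auto get_index = [&](int id) {
        for (size_t i = 0; i < gpus.size(); ++i)
            if (gpus[i] == id) return (int) i;
        return -1; // the real code now hard-fails via GGML_ASSERT instead
    };
    assert(get_index(3) == 1);   // device ID 3 is the manager's index 1
    assert(get_index(0) == -1);  // unselected IDs have no index
    return 0;
}
```
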
@@ -3639,8 +3658,7 @@ class sycl_gpu_mgr {
             if (gpus[i] == id)
                 return i;
         }
-        assert(false);
-        return -1;
+        GGML_ASSERT(false);
     }
 };
 
@@ -3649,6 +3667,7 @@ static int g_device_count = -1;
 static int g_all_sycl_device_count = -1;
 static int g_main_device = -1;
 static int g_main_device_id = -1;
+static bool g_ggml_backend_sycl_buffer_type_initialized = false;
 
 static std::array<float, GGML_SYCL_MAX_DEVICES> g_default_tensor_split = {};
 
@@ -13225,7 +13244,7 @@ void ggml_backend_sycl_print_sycl_devices() {
 }
 
 void print_gpu_device_list() {
-    fprintf(stderr, "detect %d SYCL GPUs: [%s] with Max compute units:%d\n",
+    fprintf(stderr, "detected %d SYCL GPUs: [%s] with top max compute units: %d\n",
             g_sycl_gpu_mgr->get_gpu_count(),
             g_sycl_gpu_mgr->gpus_list.c_str(),
             g_sycl_gpu_mgr->max_compute_units);
@@ -13264,6 +13283,15 @@ void ggml_init_sycl() try {
 #else
         fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
 #endif
+
+/* DO NOT REMOVE; keep it for the next XMX optimization.
+#if defined(SYCL_USE_XMX)
+        fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
+#else
+        fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
+#endif
+*/
+
         if (CHECK_TRY_ERROR(g_all_sycl_device_count =
                                 dpct::dev_mgr::instance().device_count()) != 0) {
             initialized = true;
@@ -13272,68 +13300,61 @@ void ggml_init_sycl() try {
         }
         GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
         ggml_backend_sycl_print_sycl_devices();
-
         if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
+        print_gpu_device_list();
+        initialized = true;
+        g_sycl_loaded = true;
+    }
 
-        g_device_count = g_sycl_gpu_mgr->get_gpu_count();
-        g_work_group_size = g_sycl_gpu_mgr->work_group_size;
 
-        print_gpu_device_list();
 
-        int64_t total_vram = 0;
+    g_device_count = g_sycl_gpu_mgr->get_gpu_count();
+    g_work_group_size = g_sycl_gpu_mgr->work_group_size;
 
-        /* NOT REMOVE, keep it for next optimize for XMX.
-        #if defined(SYCL_USE_XMX)
-        fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
-        #else
-        fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
-        #endif
-        */
-        for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
-            g_device_caps[id].vmm = 0;
-            g_device_caps[id].device_id = -1;
-            g_device_caps[id].cc = 0;
-            g_tensor_split[id] = 0;
-            g_default_tensor_split[id] = 0;
-        }
+    int64_t total_vram = 0;
 
-        for (int i = 0; i < g_device_count; ++i) {
-            int device_id = g_sycl_gpu_mgr->gpus[i];
-            g_device_caps[i].vmm = 0;
 
-            dpct::device_info prop;
-            SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
-                prop, dpct::dev_mgr::instance().get_device(device_id))));
+    for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
+        g_device_caps[id].vmm = 0;
+        g_device_caps[id].device_id = -1;
+        g_device_caps[id].cc = 0;
+        g_tensor_split[id] = 0;
+        g_default_tensor_split[id] = 0;
+    }
 
-            g_default_tensor_split[i] = total_vram;
-            total_vram += prop.get_global_mem_size();
+    for (int i = 0; i < g_device_count; ++i) {
+        int device_id = g_sycl_gpu_mgr->gpus[i];
+        g_device_caps[i].vmm = 0;
 
-            g_device_caps[i].cc =
-                100 * prop.get_major_version() + 10 * prop.get_minor_version();
-        }
+        dpct::device_info prop;
+        SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
+            prop, dpct::dev_mgr::instance().get_device(device_id))));
 
-        for (int i = 0; i < g_device_count; ++i) {
-            g_default_tensor_split[i] /= total_vram;
-        }
+        g_default_tensor_split[i] = total_vram;
+        total_vram += prop.get_global_mem_size();
 
-        for (int i = 0; i < g_device_count; ++i) {
-            SYCL_CHECK(ggml_sycl_set_device(i));
+        g_device_caps[i].cc =
+            100 * prop.get_major_version() + 10 * prop.get_minor_version();
+    }
 
-            // create sycl streams
-            for (int is = 0; is < MAX_STREAMS; ++is) {
-                SYCL_CHECK(CHECK_TRY_ERROR(
-                    g_syclStreams[i][is] =
-                        dpct::get_current_device().create_queue(
-                            g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
-            }
+    for (int i = 0; i < g_device_count; ++i) {
+        g_default_tensor_split[i] /= total_vram;
+    }
+
+    for (int i = 0; i < g_device_count; ++i) {
+        SYCL_CHECK(ggml_sycl_set_device(i));
 
-            const dpct::queue_ptr stream = g_syclStreams[i][0];
-            // create sycl handle
-            SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
+        // create sycl streams
+        for (int is = 0; is < MAX_STREAMS; ++is) {
+            SYCL_CHECK(CHECK_TRY_ERROR(
+                g_syclStreams[i][is] =
+                    dpct::get_current_device().create_queue(
+                        g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
         }
 
-        initialized = true;
-        g_sycl_loaded = true;
+        const dpct::queue_ptr stream = g_syclStreams[i][0];
+        // create sycl handle
+        SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
     }
 }
 catch (sycl::exception const &exc) {
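
The reordering above is the key behavioral change in `ggml_init_sycl()`: device enumeration and manager creation stay one-time inside the `if (!initialized)` block, while device counts, tensor splits, and stream creation now run on every call. A self-contained sketch of the resulting control flow (stand-in types; the real code uses the globals and `sycl_gpu_mgr` defined in ggml-sycl.cpp):

```cpp
// Sketch only: one-time setup vs. re-runnable per-device setup.
struct gpu_mgr_stub { int get_gpu_count() { return 1; } };

static gpu_mgr_stub * g_mgr = nullptr;   // stand-in for g_sycl_gpu_mgr
static int g_count = -1;                 // stand-in for g_device_count

void init_sycl_sketch() {
    static bool initialized = false;
    if (!initialized) {
        // one-time: enumerate devices and create the default manager
        if (!g_mgr) g_mgr = new gpu_mgr_stub();
        initialized = true;
    }
    // runs on every call: pick up whichever manager is current, so that
    // set_single_device() can swap g_mgr and re-call this to refresh state
    g_count = g_mgr->get_gpu_count();
    // ... per-device caps, tensor splits, and stream creation follow ...
}

void set_single_device_sketch() {
    delete g_mgr;
    g_mgr = new gpu_mgr_stub();  // a single-device manager in the real code
    init_sycl_sketch();          // rebuilds the per-device state
}

int main() {
    init_sycl_sketch();
    set_single_device_sketch();
    return 0;
}
```
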
@@ -16732,22 +16753,24 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
     /* .is_host = */ nullptr,
 };
 
-ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
+ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
+    if (device_index >= g_device_count || device_index < 0) {
+        printf("ggml_backend_sycl_buffer_type error: device_index %d is out of range [0, %d]; did you forget to call ggml_backend_sycl_set_single_device()?\n",
+               device_index, g_device_count - 1);
+        GGML_ASSERT(device_index < g_device_count);
+    }
     static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
 
-    static bool ggml_backend_sycl_buffer_type_initialized = false;
-
-    if (!ggml_backend_sycl_buffer_type_initialized) {
+    if (!g_ggml_backend_sycl_buffer_type_initialized) {
         for (int i = 0; i < g_device_count; i++) {
             ggml_backend_sycl_buffer_types[i] = {
                 /* .iface   = */ ggml_backend_sycl_buffer_type_interface,
                 /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(g_sycl_gpu_mgr->gpus[i])},
             };
         }
-        ggml_backend_sycl_buffer_type_initialized = true;
+        g_ggml_backend_sycl_buffer_type_initialized = true;
     }
-
-    return &ggml_backend_sycl_buffer_types[device];
+    return &ggml_backend_sycl_buffer_types[device_index];
 }
 
 // sycl split buffer type
@@ -17496,6 +17519,17 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
     return g_sycl_gpu_mgr->get_index(device_id);
 }
 
+GGML_API GGML_CALL void ggml_backend_sycl_set_single_device(int main_gpu_id) {
+    GGML_ASSERT(main_gpu_id < g_all_sycl_device_count);
+    printf("ggml_backend_sycl_set_single_device: use single device: %d\n", main_gpu_id);
+    if (g_sycl_gpu_mgr) {
+        delete g_sycl_gpu_mgr;
+    }
+    g_sycl_gpu_mgr = new sycl_gpu_mgr(main_gpu_id);
+    ggml_init_sycl();
+    g_ggml_backend_sycl_buffer_type_initialized = false;
+}
+
 extern "C" int ggml_backend_sycl_reg_devices();
 
 int ggml_backend_sycl_reg_devices() {
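
A caller that wants to pin the backend to one GPU would use the new API roughly like this (a sketch; the chosen ID and the declaration below are assumptions, the real prototype is exported from the SYCL backend header with the GGML_API/GGML_CALL decorations):

```cpp
// Hypothetical caller-side usage of the new single-device API.
// Assumed declaration; the real one lives in the backend header.
extern "C" void ggml_backend_sycl_set_single_device(int main_gpu_id);

int main() {
    const int main_gpu_id = 0; // assumed: an ID from the printed SYCL device list
    ggml_backend_sycl_set_single_device(main_gpu_id);
    // The call rebuilds g_sycl_gpu_mgr around one device and re-runs
    // ggml_init_sycl(), so from here on index 0 is the only device the
    // backend exposes and ggml_backend_sycl_buffer_type(0) is valid.
    return 0;
}
```
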