Skip to content

Commit ed67bcb

Browse files
ClarkChin08airMeng
andauthored
[SYCL] fix multi-gpu issue on sycl (#8554)
--------- Signed-off-by: Chen Xi <[email protected]> Co-authored-by: Meng, Hengyu <[email protected]>
1 parent eddcb52 commit ed67bcb

File tree

4 files changed

+102
-41
lines changed

4 files changed

+102
-41
lines changed

docs/backend/SYCL.md

Lines changed: 12 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -293,31 +293,26 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
293293
```sh
294294
./build/bin/llama-ls-sycl-device
295295
```
296-
A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
296+
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
297297
```
298-
found 6 SYCL devices:
298+
found 2 SYCL devices:
299+
299300
| | | |Compute |Max compute|Max work|Max sub| |
300301
|ID| Device Type| Name|capability|units |group |group |Global mem size|
301302
|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
302303
| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
303304
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
304-
| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
305-
| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
306-
| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
307-
| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
308305
```
309306

310-
| Attribute | Note |
311-
|------------------------|-------------------------------------------------------------|
312-
| compute capability 1.3 | Level-zero driver/runtime, recommended |
313-
| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases |
314307

315308
4. Launch inference
316309

317310
There are two device selection modes:
318311

319312
- Single device: Use one device target specified by the user.
320-
- Multiple devices: Automatically select the devices with the same largest Max compute-units.
313+
- Multiple devices: Automatically choose the devices with the same backend.
314+
315+
In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR.
321316

322317
| Device selection | Parameter |
323318
|------------------|----------------------------------------|
@@ -474,33 +469,26 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
474469
build\bin\ls-sycl-device.exe
475470
```
476471

477-
The output of this command in a system with 1 *intel CPU* and 1 *intel GPU* would look like the following:
472+
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
478473
```
479-
found 6 SYCL devices:
474+
found 2 SYCL devices:
480475
| | | |Compute |Max compute|Max work|Max sub| |
481476
|ID| Device Type| Name|capability|units |group |group |Global mem size|
482477
|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
483478
| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
484479
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
485-
| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
486-
| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
487-
| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
488-
| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
489480
490481
```
491482

492-
| Attribute | Note |
493-
|------------------------|-----------------------------------------------------------|
494-
| compute capability 1.3 | Level-zero running time, recommended |
495-
| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases |
496-
497483

498484
4. Launch inference
499485

500486
There are two device selection modes:
501487

502-
- Single device: Use one device assigned by user.
503-
- Multiple devices: Automatically choose the devices with the same biggest Max compute units.
488+
- Single device: Use one device assigned by user. Default device id is 0.
489+
- Multiple devices: Automatically choose the devices with the same backend.
490+
491+
In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR.
504492

505493
| Device selection | Parameter |
506494
|------------------|----------------------------------------|

ggml/src/ggml-sycl/common.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,7 @@ struct ggml_backend_sycl_context {
267267

268268
queue_ptr stream(int device, int stream) {
269269
if (qptrs[device][stream] == nullptr) {
270-
qptrs[device][stream] = &(dpct::get_current_device().default_queue());
270+
qptrs[device][stream] = &(dpct::get_device(device).default_queue());
271271
}
272272
return qptrs[device][stream];
273273
}

ggml/src/ggml-sycl/dpct/helper.hpp

Lines changed: 88 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -588,7 +588,7 @@ namespace dpct
588588
out = prop;
589589
}
590590
591-
/// dpct device extension
591+
/// dpct device extension
592592
class device_ext : public sycl::device {
593593
typedef std::mutex mutex_type;
594594
@@ -697,7 +697,7 @@ namespace dpct
697697
std::unique_lock<mutex_type> lock(m_mutex);
698698
lock.unlock();
699699
for (auto &q : _queues) {
700-
q.wait_and_throw();
700+
q.wait_and_throw();
701701
}
702702
// Guard the destruct of current_queues to make sure the ref count is
703703
// safe.
@@ -734,7 +734,12 @@ namespace dpct
734734
735735
void destroy_queue(sycl::queue queue) {
736736
std::lock_guard<mutex_type> lock(m_mutex);
737-
_queues.clear();
737+
_queues.erase(std::remove_if(_queues.begin(), _queues.end(),
738+
[=](const sycl::queue &q) -> bool
739+
{
740+
return q == queue;
741+
}),
742+
_queues.end());
738743
}
739744
void set_saved_queue(sycl::queue q) {
740745
std::lock_guard<mutex_type> lock(m_mutex);
@@ -764,13 +769,13 @@ namespace dpct
764769
if (enable_exception_handler) {
765770
eh = exception_handler;
766771
}
767-
auto q = sycl::queue(*this, eh,
768-
sycl::property_list(
772+
_queues.push_back(sycl::queue(
773+
*this, eh,
774+
sycl::property_list(
769775
#ifdef DPCT_PROFILING_ENABLED
770-
sycl::property::queue::enable_profiling(),
776+
sycl::property::queue::enable_profiling(),
771777
#endif
772-
properties...));
773-
_queues.push_back(q);
778+
properties...)));
774779
775780
return _queues.back();
776781
}
@@ -783,8 +788,8 @@ namespace dpct
783788
if (enable_exception_handler) {
784789
eh = exception_handler;
785790
}
786-
_queues.push_back(
787-
sycl::queue(device, eh,
791+
_queues.push_back(sycl::queue(
792+
device, eh,
788793
sycl::property_list(
789794
#ifdef DPCT_PROFILING_ENABLED
790795
sycl::property::queue::enable_profiling(),
@@ -855,15 +860,75 @@ namespace dpct
855860
unsigned int get_device_id(const sycl::device &dev)
856861
{
857862
unsigned int id = 0;
858-
for (auto dev_item : _devs)
863+
for (auto &dev_item : _devs)
859864
{
860865
if (*dev_item == dev)
861866
{
862-
break;
867+
return id;
863868
}
864869
id++;
865870
}
866-
return id;
871+
return -1;
872+
}
873+
874+
inline std::string get_preferred_gpu_platform_name() {
875+
std::string result;
876+
877+
std::string filter = "level-zero";
878+
char* env = getenv("ONEAPI_DEVICE_SELECTOR");
879+
if (env) {
880+
if (std::strstr(env, "level_zero")) {
881+
filter = "level-zero";
882+
}
883+
else if (std::strstr(env, "opencl")) {
884+
filter = "opencl";
885+
}
886+
else if (std::strstr(env, "cuda")) {
887+
filter = "cuda";
888+
}
889+
else if (std::strstr(env, "hip")) {
890+
filter = "hip";
891+
}
892+
else {
893+
throw std::runtime_error("invalid device filter: " + std::string(env));
894+
}
895+
}
896+
897+
auto plaform_list = sycl::platform::get_platforms();
898+
899+
for (const auto& platform : plaform_list) {
900+
auto devices = platform.get_devices();
901+
auto gpu_dev = std::find_if(devices.begin(), devices.end(), [](const sycl::device& d) {
902+
return d.is_gpu();
903+
});
904+
905+
if (gpu_dev == devices.end()) {
906+
// cout << "platform [" << platform_name
907+
// << "] does not contain GPU devices, skipping\n";
908+
continue;
909+
}
910+
911+
auto platform_name = platform.get_info<sycl::info::platform::name>();
912+
std::string platform_name_low_case;
913+
platform_name_low_case.resize(platform_name.size());
914+
915+
std::transform(
916+
platform_name.begin(), platform_name.end(), platform_name_low_case.begin(), ::tolower);
917+
918+
if (platform_name_low_case.find(filter) == std::string::npos) {
919+
// cout << "platform [" << platform_name
920+
// << "] does not match with requested "
921+
// << filter << ", skipping\n";
922+
continue;
923+
}
924+
925+
result = platform_name;
926+
}
927+
928+
if (result.empty())
929+
throw std::runtime_error("can not find preferred GPU platform");
930+
931+
return result;
867932
}
868933
869934
template <class DeviceSelector>
@@ -930,10 +995,15 @@ namespace dpct
930995
// Keep track of the number of devices per backend
931996
std::map<sycl::backend, size_t> DeviceNums;
932997
std::map<std::string, std::vector<sycl::device>> backend_devices;
998+
auto preferred_platform_name = get_preferred_gpu_platform_name();
933999
9341000
while (!Platforms.empty()) {
9351001
auto Platform = Platforms.back();
9361002
Platforms.pop_back();
1003+
auto platform_name = Platform.get_info<sycl::info::platform::name>();
1004+
if (platform_name.compare(preferred_platform_name) != 0) {
1005+
continue;
1006+
}
9371007
auto devices = Platform.get_devices();
9381008
std::string backend_type = get_device_backend_and_type(devices[0]);
9391009
for (const auto &device : devices) {
@@ -1989,6 +2059,11 @@ namespace dpct
19892059
return dev_mgr::instance().current_device();
19902060
}
19912061

2062+
static inline device_ext &get_device(unsigned int id)
2063+
{
2064+
return dev_mgr::instance().get_device(id);
2065+
}
2066+
19922067
static inline sycl::queue &get_in_order_queue()
19932068
{
19942069
return dev_mgr::instance().current_device().in_order_queue();

src/llama.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16643,9 +16643,7 @@ struct llama_context * llama_new_context_with_model(
1664316643
for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
1664416644
ggml_backend_t backend = ggml_backend_sycl_init(i);
1664516645
if (backend == nullptr) {
16646-
int id_list[GGML_SYCL_MAX_DEVICES];
16647-
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
16648-
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
16646+
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d for No.%d backend\n", __func__, i, i);
1664916647
llama_free(ctx);
1665016648
return nullptr;
1665116649
}

0 commit comments

Comments
 (0)