@@ -78,6 +78,9 @@ static int g_work_group_size = 0;
 #define GGML_SYCL_MMV_Y 1
 #endif
 
+typedef sycl::queue *queue_ptr;
+// oneMKL takes a queue where cuBLAS takes a handle, so a "handle" here is
+// just another queue pointer
+typedef sycl::queue *handle_ptr;
+
 enum ggml_sycl_backend_gpu_mode {
     SYCL_UNSET_GPU_MODE = -1,
     SYCL_SINGLE_GPU_MODE = 0,
@@ -182,17 +185,6 @@ static_assert(
 #endif // GGML_SYCL_PEER_MAX_BATCH_SIZE
 
 #define MUL_MAT_SRC1_COL_STRIDE 128
-#define MAX_STREAMS 8
-#define SYCL_MAX_DEVICES 48
-
-static dpct::queue_ptr g_syclStreams[SYCL_MAX_DEVICES][MAX_STREAMS] = {{0}};
-
-struct ggml_tensor_extra_gpu {
-    void * data_device[SYCL_MAX_DEVICES]; // 1 pointer for each device for split
-                                          // tensors
-    dpct::event_ptr events[SYCL_MAX_DEVICES]
-                          [MAX_STREAMS]; // events for synchronizing multiple GPUs
-};
 
 class sycl_gpu_mgr {
   public:
@@ -320,7 +312,7 @@ class sycl_gpu_mgr {
     }
 };
 
-static sycl_gpu_mgr* g_sycl_gpu_mgr = NULL;
+static sycl_gpu_mgr* g_sycl_gpu_mgr = new sycl_gpu_mgr(0);
 static int g_device_count = -1;
 static int g_all_sycl_device_count = -1;
 static int g_main_device = -1;
@@ -329,31 +321,15 @@ static bool g_ggml_backend_sycl_buffer_type_initialized = false;
 
-static std::array<float, SYCL_MAX_DEVICES> g_default_tensor_split = {};
+static std::array<float, GGML_SYCL_MAX_DEVICES> g_default_tensor_split = {};
 
-static float g_tensor_split[SYCL_MAX_DEVICES] = {0};
+static float g_tensor_split[GGML_SYCL_MAX_DEVICES] = {0};
 
 static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode =
     SYCL_UNSET_GPU_MODE;
 
-struct sycl_device_capabilities {
-    int cc;                 // compute capability
-    bool vmm;               // virtual memory support
-    size_t vmm_granularity; // granularity of virtual memory
-    int device_id;
-};
-
-static sycl_device_capabilities g_device_caps[SYCL_MAX_DEVICES] = {
-    {0, false, 0, -1}};
-
-struct sycl_device_id2index {
-    int index;
-};
-
 static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 0; // disabled by default
 static size_t g_scratch_offset = 0;
 
-static dpct::queue_ptr g_sycl_handles[SYCL_MAX_DEVICES] = {nullptr};
-
 int get_main_device();
 
 [[noreturn]] static inline void bad_arch(const sycl::stream& stream_ct1) {
@@ -427,25 +403,151 @@ inline dpct::err0 ggml_sycl_set_device(const int device) try {
     std::exit(1);
 }
 
-void log_ggml_var_device(
-    const char * name,
-    float * src,
-    size_t total_elements,
-    bool src_on_device);
-
-void log_ggml_var_device_fp16(
-    const char * name,
-    sycl::half* src,
-    size_t total_elements,
-    bool src_on_device);
-
-// todo: debug for crash in some case
-void print_ggml_tensor(const char * name, struct ggml_tensor * src);
-
-static int log_file_name_idx = 0;
-void log_tensor_with_cnt(
-    const char * name,
-    struct ggml_tensor * src,
-    int stop_cnt);
+// ////////////////////
+
+struct ggml_sycl_device_info {
+    int device_count;
+
+    struct sycl_device_info {
+        int    cc;   // compute capability
+        // int    nsm;  // number of streaming multiprocessors
+        // size_t smpb; // max. shared memory per block
+        bool   vmm;  // virtual memory support
+        size_t total_vram;
+    };
+
+    sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};
+
+    std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
+};
+
+const ggml_sycl_device_info & ggml_sycl_info();
+
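+// Expected usage (sketch; the implementation lives in ggml-sycl.cpp and is
+// assumed to build the table once on first call, then return a cached copy):
+//
+//     const ggml_sycl_device_info & info = ggml_sycl_info();
+//     for (int i = 0; i < info.device_count; ++i) { /* ... */ }
+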
+struct ggml_sycl_pool {
+    virtual ~ggml_sycl_pool() = default;
+
+    virtual void * alloc(size_t size, size_t * actual_size) = 0;
+    virtual void   free(void * ptr, size_t size) = 0;
+};
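+
+// Illustrative sketch only (the concrete pools live in ggml-sycl.cpp; the
+// name below is hypothetical): a trivial pass-through pool that forwards to
+// USM device allocations on a given queue.
+//
+//     struct ggml_sycl_pool_direct : public ggml_sycl_pool {
+//         queue_ptr q;
+//         explicit ggml_sycl_pool_direct(queue_ptr q) : q(q) {}
+//         void * alloc(size_t size, size_t * actual_size) override {
+//             *actual_size = size; // no rounding/reuse in this sketch
+//             return sycl::malloc_device(size, *q);
+//         }
+//         void free(void * ptr, size_t size) override {
+//             (void) size; // unused in this trivial pool
+//             sycl::free(ptr, *q);
+//         }
+//     };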
+
+template<typename T>
+struct ggml_sycl_pool_alloc {
+    ggml_sycl_pool * pool = nullptr;
+    T * ptr = nullptr;
+    size_t actual_size = 0;
+
+    explicit ggml_sycl_pool_alloc(ggml_sycl_pool & pool) : pool(&pool) {
+    }
+
+    ggml_sycl_pool_alloc(ggml_sycl_pool & pool, size_t size) : pool(&pool) {
+        alloc(size);
+    }
+
+    ~ggml_sycl_pool_alloc() {
+        if (ptr != nullptr) {
+            pool->free(ptr, actual_size);
+        }
+    }
+
+    // size is in number of elements
+    T * alloc(size_t size) {
+        GGML_ASSERT(pool != nullptr);
+        GGML_ASSERT(ptr == nullptr);
+        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
+        return ptr;
+    }
+
+    T * alloc(ggml_sycl_pool & pool, size_t size) {
+        this->pool = &pool;
+        return alloc(size);
+    }
+
+    T * get() {
+        return ptr;
+    }
+
+    ggml_sycl_pool_alloc() = default;
+    ggml_sycl_pool_alloc(const ggml_sycl_pool_alloc &) = delete;
+    ggml_sycl_pool_alloc(ggml_sycl_pool_alloc &&) = delete;
+    ggml_sycl_pool_alloc& operator=(const ggml_sycl_pool_alloc &) = delete;
+    ggml_sycl_pool_alloc& operator=(ggml_sycl_pool_alloc &&) = delete;
+};
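+
+// Usage sketch (illustrative): a pool-backed RAII temporary of n floats,
+// returned to the pool automatically at scope exit.
+//
+//     ggml_sycl_pool_alloc<float> tmp(ctx.pool(), n); // n * sizeof(float) bytes
+//     float * dev = tmp.get();
+//     /* ... use dev on the device ... */
+//     // ~ggml_sycl_pool_alloc() calls pool->free(dev, actual_size)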
+
+// backend interface
+
+struct ggml_tensor_extra_gpu {
+    void * data_device[GGML_SYCL_MAX_DEVICES]; // 1 pointer for each device for split
+                                               // tensors
+    dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
+                          [GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs
+};
+
+struct ggml_backend_sycl_context {
+    int device;
+    std::string name;
+
+    queue_ptr  qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
+    // per-context, non-static: an in-class initialized static array is
+    // ill-formed, and each context should own its handles
+    handle_ptr sycl_handles[GGML_SYCL_MAX_DEVICES] = { nullptr };
+
+    explicit ggml_backend_sycl_context(int device) :
+        device(device),
+        name(GGML_SYCL_NAME + std::to_string(device)) {
+    }
+
+    ~ggml_backend_sycl_context() {
+        for (int i = 0; i < GGML_SYCL_MAX_DEVICES; ++i) {
+            for (int j = 0; j < GGML_SYCL_MAX_STREAMS; ++j) {
+                if (qptrs[i][j] != nullptr) {
+                    // queues were created through dpct, so release them the same way
+                    dpct::get_current_device().destroy_queue(qptrs[i][j]);
+                }
+            }
+            // sycl_handles[i] aliases qptrs[i][0]; there is nothing separate to free
+            sycl_handles[i] = nullptr;
+        }
+    }
+
+    queue_ptr stream(int device, int stream) {
+        if (qptrs[device][stream] == nullptr) {
+            // lazily create the queue on the shared context and remember it
+            qptrs[device][stream] = dpct::get_current_device().create_queue(
+                g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device());
+        }
+        return qptrs[device][stream];
+    }
+
+    queue_ptr stream() {
+        return stream(device, 0);
+    }
+
+    handle_ptr sycl_handle(int device) {
+        if (sycl_handles[device] == nullptr) {
+            const queue_ptr stream = qptrs[device][0];
+            // reuse the primary queue of the device as its "handle"
+            SYCL_CHECK(CHECK_TRY_ERROR(sycl_handles[device] = stream));
+        }
+        return sycl_handles[device];
+    }
+
+    handle_ptr sycl_handle() {
+        return sycl_handle(device);
+    }
+
+    // pool
+    std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];
+
+    static std::unique_ptr<ggml_sycl_pool> new_pool_for_device(queue_ptr qptr, int device);
+
+    ggml_sycl_pool & pool(int device) {
+        if (pools[device] == nullptr) {
+            // stream(device, 0) guarantees the queue exists before the pool is built
+            pools[device] = new_pool_for_device(stream(device, 0), device);
+        }
+        return *pools[device];
+    }
+
+    ggml_sycl_pool & pool() {
+        return pool(device);
+    }
+};
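+
+// Typical call-site pattern (sketch; names are illustrative): each backend
+// instance owns one context, and ops pull the lazily created queue and pool
+// out of it:
+//
+//     ggml_backend_sycl_context ctx(device);
+//     queue_ptr q = ctx.stream();        // default stream of ctx.device
+//     ggml_sycl_pool & p = ctx.pool();   // scratch pool of ctx.device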
+
 
 #endif // GGML_SYCL_COMMON_HPP