Skip to content

Commit cae9fb4

Browse files
authored
HIP: Only call rocblas_initialize on rocBLAS versions with the multiple instantiation bug (#11080)
This disables the workaround on rocBLAS versions in which the bug is fixed (>= 4.0.0), eliminating the runtime cost and unnecessary VRAM allocation caused by loading all Tensile objects.
1 parent 7fee288 commit cae9fb4

File tree

1 file changed

+20
-2
lines changed

1 file changed

+20
-2
lines changed

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
#include <algorithm>
4343
#include <array>
4444
#include <atomic>
45+
#include <charconv>
4546
#include <cinttypes>
4647
#include <cstddef>
4748
#include <cstdint>
@@ -172,8 +173,25 @@ static ggml_cuda_device_info ggml_cuda_init() {
172173
#ifdef __HIP_PLATFORM_AMD__
173174
// Workaround for a rocBLAS bug when using multiple graphics cards:
174175
// https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
175-
rocblas_initialize();
176-
CUDA_CHECK(cudaDeviceSynchronize());
176+
{
177+
int major_version = 0;
178+
size_t version_length = 0;
179+
if (rocblas_get_version_string_size(&version_length) == rocblas_status_success) {
180+
std::string version(version_length, '\0');
181+
if (rocblas_get_version_string(version.data(), version.size()) == rocblas_status_success) {
182+
version.resize(::strlen(version.c_str()));
183+
int parsed_value = 0;
184+
if (std::from_chars(version.c_str(), version.c_str() + version.length(), parsed_value).ec == std::errc()) {
185+
major_version = parsed_value;
186+
}
187+
}
188+
}
189+
if (major_version < 4) {
190+
GGML_LOG_DEBUG(GGML_CUDA_NAME " calling rocblas_initialize as a workaround for a rocBLAS bug\n");
191+
rocblas_initialize();
192+
CUDA_CHECK(cudaDeviceSynchronize());
193+
}
194+
}
177195
#endif
178196

179197
ggml_cuda_device_info info = {};

0 commit comments

Comments
 (0)