Skip to content

Commit 2023aa3

Browse files
committed
ucx: check specific devices to determine priority
Signed-off-by: Yossi Itigin <[email protected]>
1 parent 723c6b7 commit 2023aa3

File tree

4 files changed

+159
-46
lines changed

4 files changed

+159
-46
lines changed

contrib/platform/mellanox/optimized.conf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ coll = ^ml
6262
hwloc_base_binding_policy = core
6363
btl = self
6464
pml_ucx_tls = any
65+
pml_ucx_devices = any
6566
# Basic behavior to smooth startup
6667
mca_base_component_show_load_errors = 0
6768
orte_abort_timeout = 10

ompi/mca/pml/ucx/pml_ucx_component.c

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,17 +92,26 @@ static mca_pml_base_module_t*
9292
mca_pml_ucx_component_init(int* priority, bool enable_progress_threads,
9393
bool enable_mpi_threads)
9494
{
95+
opal_common_ucx_support_level_t support_level;
9596
int ret;
9697

97-
if (!opal_common_ucx_is_enabled(ompi_pml_ucx.ucp_context)) {
98+
support_level = opal_common_ucx_support_level(ompi_pml_ucx.ucp_context);
99+
if (support_level == OPAL_COMMON_UCX_SUPPORT_NONE) {
98100
return NULL;
99101
}
100102

101103
if ( (ret = mca_pml_ucx_init(enable_mpi_threads)) != 0) {
102104
return NULL;
103105
}
104106

105-
*priority = ompi_pml_ucx.priority;
107+
/*
108+
* If found supported devices - set to the configured (high) priority.
109+
* Otherwise - Found only supported transports (which could be exposed by
110+
* unsupported devices), so set a priority lower than ob1.
111+
*/
112+
*priority = (support_level == OPAL_COMMON_UCX_SUPPORT_DEVICE) ?
113+
ompi_pml_ucx.priority : 19;
114+
PML_UCX_VERBOSE(2, "returning priority %d", *priority);
106115
return &ompi_pml_ucx.super;
107116
}
108117

opal/mca/common/ucx/common_ucx.c

Lines changed: 133 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -43,12 +43,14 @@ static void opal_common_ucx_mem_release_cb(void *buf, size_t length,
4343

4444
OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t *component)
4545
{
46-
static const char *default_tls = "rc_verbs,ud_verbs,rc_mlx5,dc_mlx5,cuda_ipc,rocm_ipc";
46+
static const char *default_tls = "rc_verbs,ud_verbs,rc_mlx5,dc_mlx5,cuda_ipc,rocm_ipc";
47+
static const char *default_devices = "mlx*";
4748
static int registered = 0;
4849
static int hook_index;
4950
static int verbose_index;
5051
static int progress_index;
5152
static int tls_index;
53+
static int devices_index;
5254

5355
if (!registered) {
5456
verbose_index = mca_base_var_register("opal", "opal_common", "ucx", "verbose",
@@ -74,14 +76,25 @@ OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t *
7476
opal_common_ucx.tls = malloc(sizeof(*opal_common_ucx.tls));
7577
*opal_common_ucx.tls = strdup(default_tls);
7678
tls_index = mca_base_var_register("opal", "opal_common", "ucx", "tls",
77-
"List of transports which enable using UCX component. Special "
78-
"values: any (any available). A '^' prefix negates the list. "
79-
"In order to exclude on shared memory and TCP transports, "
79+
"List of UCX transports which should be supported on the system, to enable "
80+
"selecting the UCX component. Special values: any (any available). "
81+
"A '^' prefix negates the list. "
82+
"For example, in order to exclude on shared memory and TCP transports, "
8083
"please set to '^posix,sysv,self,tcp,cma,knem,xpmem'.",
8184
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
8285
OPAL_INFO_LVL_3,
8386
MCA_BASE_VAR_SCOPE_LOCAL,
8487
opal_common_ucx.tls);
88+
89+
opal_common_ucx.devices = malloc(sizeof(*opal_common_ucx.devices));
90+
*opal_common_ucx.devices = strdup(default_devices);
91+
devices_index = mca_base_var_register("opal", "opal_common", "ucx", "devices",
92+
"List of device driver pattern names, which, if supported by UCX, will "
93+
"bump its priority above ob1. Special values: any (any available)",
94+
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
95+
OPAL_INFO_LVL_3,
96+
MCA_BASE_VAR_SCOPE_LOCAL,
97+
opal_common_ucx.devices);
8598
registered = 1;
8699
}
87100
if (component) {
@@ -101,6 +114,10 @@ OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t *
101114
component->mca_type_name,
102115
component->mca_component_name,
103116
"tls", 0);
117+
mca_base_var_register_synonym(devices_index, component->mca_project_name,
118+
component->mca_type_name,
119+
component->mca_component_name,
120+
"devices", 0);
104121
}
105122
}
106123

@@ -149,91 +166,164 @@ OPAL_DECLSPEC void opal_common_ucx_mca_deregister(void)
149166
opal_output_close(opal_common_ucx.output);
150167
}
151168

152-
OPAL_DECLSPEC int opal_common_ucx_is_enabled(ucp_context_h context)
169+
#if HAVE_DECL_OPEN_MEMSTREAM
170+
static bool opal_common_ucx_check_device(const char *device_name, char **device_list)
153171
{
172+
char sysfs_driver_link[PATH_MAX];
173+
char driver_path[PATH_MAX];
174+
char *ib_device_name;
175+
char *driver_name;
176+
char **list_item;
177+
ssize_t ret;
178+
179+
/* mlx5_0:1 */
180+
ret = sscanf(device_name, "%m[^:]%*d", &ib_device_name);
181+
if (ret != 1) {
182+
return false;
183+
}
184+
185+
sysfs_driver_link[sizeof(sysfs_driver_link) - 1] = '\0';
186+
snprintf(sysfs_driver_link, sizeof(sysfs_driver_link) - 1,
187+
"/sys/class/infiniband/%s/device/driver", ib_device_name);
188+
free(ib_device_name);
189+
190+
driver_path[sizeof(driver_path) - 1] = '\0';
191+
ret = readlink(sysfs_driver_link, driver_path, sizeof(driver_path) - 1);
192+
if (ret < 0) {
193+
MCA_COMMON_UCX_VERBOSE(2, "readlink(%s) failed: %s", sysfs_driver_link,
194+
strerror(errno));
195+
return false;
196+
}
197+
198+
driver_name = basename(driver_path);
199+
for (list_item = device_list; *list_item != NULL; ++list_item) {
200+
if (!fnmatch(*list_item, driver_name, 0)) {
201+
MCA_COMMON_UCX_VERBOSE(2, "driver '%s' matched by '%s'",
202+
driver_path, *list_item);
203+
return true;
204+
}
205+
}
206+
207+
return false;
208+
}
209+
#endif
210+
211+
OPAL_DECLSPEC opal_common_ucx_support_level_t
212+
opal_common_ucx_support_level(ucp_context_h context)
213+
{
214+
opal_common_ucx_support_level_t support_level = OPAL_COMMON_UCX_SUPPORT_NONE;
215+
static const char *support_level_names[] = {
216+
[OPAL_COMMON_UCX_SUPPORT_NONE] = "none",
217+
[OPAL_COMMON_UCX_SUPPORT_TRANSPORT] = "transports only",
218+
[OPAL_COMMON_UCX_SUPPORT_DEVICE] = "transports and devices"
219+
};
154220
#if HAVE_DECL_OPEN_MEMSTREAM
155-
char needle[64], line[128], *ptr;
156-
char **tl_list, **tl_name;
157-
int found, negate, enable;
221+
char *rsc_tl_name, *rsc_device_name;
222+
char **tl_list, **device_list, **list_item;
223+
bool is_any_tl, is_any_device;
224+
bool found_tl, negate;
225+
char line[128];
158226
FILE *stream;
159227
char *buffer;
160228
size_t size;
229+
int ret;
161230
#endif
162231

232+
is_any_tl = !strcmp(*opal_common_ucx.tls, "any");
233+
is_any_device = !strcmp(*opal_common_ucx.devices, "any");
234+
163235
/* Check for special value "any" */
164-
if (!strcmp(*opal_common_ucx.tls, "any")) {
165-
MCA_COMMON_UCX_VERBOSE(1, "ucx is enabled on any transport",
236+
if (is_any_tl && is_any_device) {
237+
MCA_COMMON_UCX_VERBOSE(1, "ucx is enabled on any transport or device",
166238
*opal_common_ucx.tls);
167-
enable = 1;
239+
support_level = OPAL_COMMON_UCX_SUPPORT_DEVICE;
168240
goto out;
169241
}
170242

171243
#if HAVE_DECL_OPEN_MEMSTREAM
172-
enable = 0;
244+
/* Split transports list */
173245
negate = ('^' == (*opal_common_ucx.tls)[0]);
174-
tl_list = opal_argv_split(*opal_common_ucx.tls + negate, ',');
246+
tl_list = opal_argv_split(*opal_common_ucx.tls + (negate ? 1 : 0), ',');
175247
if (tl_list == NULL) {
176248
MCA_COMMON_UCX_VERBOSE(1, "failed to split tl list '%s', ucx is disabled",
177249
*opal_common_ucx.tls);
178250
goto out;
179251
}
180252

253+
/* Split devices list */
254+
device_list = opal_argv_split(*opal_common_ucx.devices, ',');
255+
if (device_list == NULL) {
256+
MCA_COMMON_UCX_VERBOSE(1, "failed to split devices list '%s', ucx is disabled",
257+
*opal_common_ucx.devices);
258+
goto out_free_tl_list;
259+
}
260+
261+
/* Open memory stream to dump UCX information to */
181262
stream = open_memstream(&buffer, &size);
182263
if (stream == NULL) {
183264
MCA_COMMON_UCX_VERBOSE(1, "failed to open memory stream for ucx info (%s), "
184265
"ucx is disabled", strerror(errno));
185-
goto out_free_tl_list;
266+
goto out_free_device_list;
186267
}
187268

188-
/* print ucx transports information to a memory stream */
269+
/* Print ucx transports information to the memory stream */
189270
ucp_context_print_info(context, stream);
190271

191-
/* rewind and read transports list from the stream */
272+
/* Rewind and read transports/devices list from the stream */
192273
fseek(stream, 0, SEEK_SET);
193-
while (!enable && (fgets(line, sizeof(line), stream) != NULL)) {
194-
/* match actual transports (not connection managers)
195-
e.g "resource 6 : md 5 dev 4 flags -- rc_verbs/mlx5_0:1" */
196-
if (fnmatch("#* resource * flags -- */*", line, 0)) {
274+
while ((support_level != OPAL_COMMON_UCX_SUPPORT_DEVICE) &&
275+
(fgets(line, sizeof(line), stream) != NULL)) {
276+
rsc_tl_name = NULL;
277+
ret = sscanf(line,
278+
/* "# resource 6 : md 5 dev 4 flags -- rc_verbs/mlx5_0:1" */
279+
"# resource %*d : md %*d dev %*d flags -- %m[^/ \n\r]/%m[^/ \n\r]",
280+
&rsc_tl_name, &rsc_device_name);
281+
if (ret != 2) {
282+
free(rsc_tl_name);
197283
continue;
198284
}
199-
line[strcspn(line, "\r\n")] = '\0'; /* remove trailing newline */
200-
201-
/* Check if any of the transports in the provided list is found in the
202-
current info output line */
203-
found = 0;
204-
for (tl_name = tl_list; !found && (*tl_name != NULL); ++tl_name) {
205-
snprintf(needle, sizeof(needle), "%s/", *tl_name);
206-
ptr = strstr(line, needle);
207-
if (ptr != NULL) {
208-
MCA_COMMON_UCX_VERBOSE(2, "%scluded transport '%s' as '%s'",
209-
negate ? "skip ex" : "found in",
210-
*tl_name, ptr);
211-
found = 1;
212-
}
285+
286+
/* Check if 'rsc_tl_name' is found provided list */
287+
found_tl = is_any_tl;
288+
for (list_item = tl_list; !found_tl && (*list_item != NULL); ++list_item) {
289+
found_tl = !strcmp(*list_item, rsc_tl_name);
213290
}
214291

215-
if (found) {
216-
/* found a transport: enable if it's an include-list */
217-
enable = !negate;
292+
/* Check if the transport has a match (either positive or negative) */
293+
assert(!(is_any_tl && negate));
294+
if (found_tl != negate) {
295+
if (is_any_device ||
296+
opal_common_ucx_check_device(rsc_device_name, device_list)) {
297+
MCA_COMMON_UCX_VERBOSE(2, "%s/%s: matched both transport and device list",
298+
rsc_tl_name, rsc_device_name);
299+
support_level = OPAL_COMMON_UCX_SUPPORT_DEVICE;
300+
} else {
301+
MCA_COMMON_UCX_VERBOSE(2, "%s/%s: matched transport list but not device list",
302+
rsc_tl_name, rsc_device_name);
303+
support_level = OPAL_COMMON_UCX_SUPPORT_TRANSPORT;
304+
}
218305
} else {
219-
/* did not find a transport: enable if it's an exclude-list */
220-
MCA_COMMON_UCX_VERBOSE(2, "line '%s' is not matched", line);
221-
enable = negate;
306+
MCA_COMMON_UCX_VERBOSE(2, "%s/%s: did not match transport list",
307+
rsc_tl_name, rsc_device_name);
222308
}
309+
310+
free(rsc_device_name);
311+
free(rsc_tl_name);
223312
}
224313

225-
MCA_COMMON_UCX_VERBOSE(2, "ucx is %sabled", enable ? "en" : "dis");
314+
MCA_COMMON_UCX_VERBOSE(2, "support level is %s", support_level_names[support_level]);
226315
fclose(stream);
227316
free(buffer);
228317

318+
out_free_device_list:
319+
opal_argv_free(device_list);
229320
out_free_tl_list:
230321
opal_argv_free(tl_list);
231322
out:
232-
return enable;
233323
#else
234324
MCA_COMMON_UCX_VERBOSE(2, "open_memstream() was not found, ucx is disabled");
235-
return 0;
236325
#endif
326+
return support_level;
237327
}
238328

239329
void opal_common_ucx_empty_complete_cb(void *request, ucs_status_t status)

opal/mca/common/ucx/common_ucx.h

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,18 +97,31 @@ typedef struct opal_common_ucx_module {
9797
int registered;
9898
bool opal_mem_hooks;
9999
char **tls;
100+
char **devices;
100101
} opal_common_ucx_module_t;
101102

102103
typedef struct opal_common_ucx_del_proc {
103104
ucp_ep_h ep;
104105
size_t vpid;
105106
} opal_common_ucx_del_proc_t;
106107

108+
/*
 * Result of matching the available UCX resources against the configured
 * lists of supported transports and devices. The values increase with the
 * level of support and are also used as array indices.
 */
typedef enum {
    /* No supported transports found (according to the configured list of
       supported transports) */
    OPAL_COMMON_UCX_SUPPORT_NONE      = 0,

    /* Supported transports were found, but not on supported devices */
    OPAL_COMMON_UCX_SUPPORT_TRANSPORT = 1,

    /* Both supported transports and supported devices were found */
    OPAL_COMMON_UCX_SUPPORT_DEVICE    = 2,
} opal_common_ucx_support_level_t;
119+
107120
extern opal_common_ucx_module_t opal_common_ucx;
108121

109122
OPAL_DECLSPEC void opal_common_ucx_mca_register(void);
110123
OPAL_DECLSPEC void opal_common_ucx_mca_deregister(void);
111-
OPAL_DECLSPEC int opal_common_ucx_is_enabled(ucp_context_h context);
124+
OPAL_DECLSPEC opal_common_ucx_support_level_t opal_common_ucx_support_level(ucp_context_h context);
112125
OPAL_DECLSPEC void opal_common_ucx_mca_proc_added(void);
113126
OPAL_DECLSPEC void opal_common_ucx_empty_complete_cb(void *request, ucs_status_t status);
114127
OPAL_DECLSPEC int opal_common_ucx_mca_pmix_fence(ucp_worker_h worker);

0 commit comments

Comments
 (0)