Skip to content

Commit beb69e2

Browse files
committed
common/ucx: fix variable registration
The registration code in common/ucx was incredibly buggy and will not work as written with MPI_T or Sessions. The code registered its variables once despite the fact that there are valid code paths where they may get unregistered and need to be registered again. This was apparently done to avoid re-registration. This is completely unnecesary. This commit removes the protection around re-registration and simply ensures that the strings are only set to their defaults if they have been reset to NULL (which happens on deregistration). I also cleaned up a memory leak where the component was allocating an extra pointer to hold the strings (??!). Signed-off-by: Nathan Hjelm <[email protected]>
1 parent 3990b3f commit beb69e2

File tree

2 files changed

+63
-95
lines changed

2 files changed

+63
-95
lines changed

opal/mca/common/ucx/common_ucx.c

Lines changed: 61 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
1+
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
12
/*
23
* Copyright (C) Mellanox Technologies Ltd. 2018. ALL RIGHTS RESERVED.
34
* Copyright (c) 2019 Intel, Inc. All rights reserved.
45
* Copyright (c) 2019 Research Organization for Information Science
56
* and Technology (RIST). All rights reserved.
67
* Copyright (c) 2021 Triad National Security, LLC. All rights
78
* reserved.
9+
* Copyright (c) 2022 Google, LLC. All rights reserved.
810
*
911
* $COPYRIGHT$
1012
*
@@ -33,13 +35,11 @@ static int use_safety_valve = 0;
3335

3436
extern mca_base_framework_t opal_memory_base_framework;
3537

36-
opal_common_ucx_module_t opal_common_ucx = {.verbose = 0,
37-
.progress_iterations = 100,
38-
.registered = 0,
39-
.opal_mem_hooks = 1,
40-
.tls = NULL};
41-
42-
static opal_mutex_t opal_common_ucx_mutex = OPAL_MUTEX_STATIC_INIT;
38+
opal_common_ucx_module_t opal_common_ucx =
39+
{
40+
.progress_iterations = 100,
41+
.opal_mem_hooks = 1,
42+
};
4343

4444
static void opal_common_ucx_mem_release_cb(void *buf, size_t length, void *cbdata, bool from_alloc)
4545
{
@@ -48,87 +48,57 @@ static void opal_common_ucx_mem_release_cb(void *buf, size_t length, void *cbdat
4848

4949
OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t *component)
5050
{
51-
static const char *default_tls = "rc_verbs,ud_verbs,rc_mlx5,dc_mlx5,ud_mlx5,cuda_ipc,rocm_ipc";
52-
static const char *default_devices = "mlx*";
53-
static int hook_index;
54-
static int verbose_index;
55-
static int progress_index;
56-
static int tls_index;
57-
static int devices_index;
58-
int param;
59-
60-
OPAL_THREAD_LOCK(&opal_common_ucx_mutex);
61-
62-
param = mca_base_var_find("opal", "opal_common", "ucx", "verbose");
63-
if (0 > param) {
64-
verbose_index = mca_base_var_register("opal", "opal_common", "ucx", "verbose",
65-
"Verbose level of the UCX components",
66-
MCA_BASE_VAR_TYPE_INT, NULL, 0,
67-
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
68-
MCA_BASE_VAR_SCOPE_LOCAL, &opal_common_ucx.verbose);
69-
}
70-
71-
param = mca_base_var_find("opal", "opal_common", "ucx", "progress_iterations");
72-
if (0 > param) {
73-
progress_index = mca_base_var_register("opal", "opal_common", "ucx", "progress_iterations",
74-
"Set number of calls of internal UCX progress "
75-
"calls per opal_progress call",
76-
MCA_BASE_VAR_TYPE_INT, NULL, 0,
77-
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
78-
MCA_BASE_VAR_SCOPE_LOCAL,
79-
&opal_common_ucx.progress_iterations);
80-
}
81-
82-
param = mca_base_var_find("opal", "opal_common", "ucx", "opal_mem_hooks");
83-
if (0 > param) {
84-
hook_index = mca_base_var_register("opal", "opal_common", "ucx", "opal_mem_hooks",
85-
"Use OPAL memory hooks, instead of UCX internal "
86-
"memory hooks",
87-
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_3,
51+
char *default_tls = "rc_verbs,ud_verbs,rc_mlx5,dc_mlx5,ud_mlx5,cuda_ipc,rocm_ipc";
52+
char *default_devices = "mlx*";
53+
int hook_index;
54+
int verbose_index;
55+
int progress_index;
56+
int tls_index;
57+
int devices_index;
58+
59+
/* It is harmless to re-register variables so go ahead an re-register. */
60+
verbose_index = mca_base_var_register("opal", "opal_common", "ucx", "verbose",
61+
"Verbose level of the UCX components",
62+
MCA_BASE_VAR_TYPE_INT, NULL, 0,
63+
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
64+
MCA_BASE_VAR_SCOPE_LOCAL, &opal_common_ucx.verbose);
65+
progress_index = mca_base_var_register("opal", "opal_common", "ucx", "progress_iterations",
66+
"Set number of calls of internal UCX progress "
67+
"calls per opal_progress call",
68+
MCA_BASE_VAR_TYPE_INT, NULL, 0,
69+
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
8870
MCA_BASE_VAR_SCOPE_LOCAL,
89-
&opal_common_ucx.opal_mem_hooks);
71+
&opal_common_ucx.progress_iterations);
72+
hook_index = mca_base_var_register("opal", "opal_common", "ucx", "opal_mem_hooks",
73+
"Use OPAL memory hooks, instead of UCX internal "
74+
"memory hooks",
75+
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_3,
76+
MCA_BASE_VAR_SCOPE_LOCAL,
77+
&opal_common_ucx.opal_mem_hooks);
78+
79+
if (NULL == opal_common_ucx.tls) {
80+
opal_common_ucx.tls = default_tls;
9081
}
9182

92-
param = mca_base_var_find("opal", "opal_common", "ucx", "tls");
93-
if (0 > param) {
94-
95-
/*
96-
* this monkey business is needed because of the way the MCA VARs framework tries to handle pointers to strings
97-
* when destructing the MCA var database. If you don't do something like this,the MCA var framework will try
98-
* to dereference a pointer which itself is no longer a valid address owing to having been previously dlclosed.
99-
* Same for the devices pointer below.
100-
*/
101-
if (NULL == opal_common_ucx.tls) {
102-
opal_common_ucx.tls = malloc(sizeof(*opal_common_ucx.tls));
103-
assert(NULL != opal_common_ucx.tls);
104-
}
105-
*opal_common_ucx.tls = strdup(default_tls);
106-
tls_index = mca_base_var_register(
107-
"opal", "opal_common", "ucx", "tls",
108-
"List of UCX transports which should be supported on the system, to enable "
109-
"selecting the UCX component. Special values: any (any available). "
110-
"A '^' prefix negates the list. "
111-
"For example, in order to exclude on shared memory and TCP transports, "
112-
"please set to '^posix,sysv,self,tcp,cma,knem,xpmem'.",
113-
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
114-
opal_common_ucx.tls);
115-
}
116-
117-
param = mca_base_var_find("opal", "opal_common", "ucx", "devices");
118-
if (0 > param) {
119-
120-
if (NULL == opal_common_ucx.devices) {
121-
opal_common_ucx.devices = malloc(sizeof(*opal_common_ucx.devices));
122-
assert(NULL != opal_common_ucx.devices);
123-
}
124-
*opal_common_ucx.devices = strdup(default_devices);
125-
devices_index = mca_base_var_register(
126-
"opal", "opal_common", "ucx", "devices",
127-
"List of device driver pattern names, which, if supported by UCX, will "
128-
"bump its priority above ob1. Special values: any (any available)",
129-
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
130-
opal_common_ucx.devices);
83+
tls_index = mca_base_var_register(
84+
"opal", "opal_common", "ucx", "tls",
85+
"List of UCX transports which should be supported on the system, to enable "
86+
"selecting the UCX component. Special values: any (any available). "
87+
"A '^' prefix negates the list. "
88+
"For example, in order to exclude on shared memory and TCP transports, "
89+
"please set to '^posix,sysv,self,tcp,cma,knem,xpmem'.",
90+
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
91+
&opal_common_ucx.tls);
92+
93+
if (NULL == opal_common_ucx.devices) {
94+
opal_common_ucx.devices = default_devices;
13195
}
96+
devices_index = mca_base_var_register(
97+
"opal", "opal_common", "ucx", "devices",
98+
"List of device driver pattern names, which, if supported by UCX, will "
99+
"bump its priority above ob1. Special values: any (any available)",
100+
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
101+
&opal_common_ucx.devices);
132102

133103
if (component) {
134104
mca_base_var_register_synonym(verbose_index, component->mca_project_name,
@@ -147,8 +117,6 @@ OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t *
147117
component->mca_type_name, component->mca_component_name,
148118
"devices", 0);
149119
}
150-
151-
OPAL_THREAD_UNLOCK(&opal_common_ucx_mutex);
152120
}
153121

154122
OPAL_DECLSPEC void opal_common_ucx_mca_register(void)
@@ -261,8 +229,8 @@ OPAL_DECLSPEC opal_common_ucx_support_level_t opal_common_ucx_support_level(ucp_
261229
int ret;
262230
#endif
263231

264-
is_any_tl = !strcmp(*opal_common_ucx.tls, "any");
265-
is_any_device = !strcmp(*opal_common_ucx.devices, "any");
232+
is_any_tl = !strcmp(opal_common_ucx.tls, "any");
233+
is_any_device = !strcmp(opal_common_ucx.devices, "any");
266234

267235
/* Check for special value "any" */
268236
if (is_any_tl && is_any_device) {
@@ -273,19 +241,19 @@ OPAL_DECLSPEC opal_common_ucx_support_level_t opal_common_ucx_support_level(ucp_
273241

274242
#if HAVE_DECL_OPEN_MEMSTREAM
275243
/* Split transports list */
276-
negate = ('^' == (*opal_common_ucx.tls)[0]);
277-
tl_list = opal_argv_split(*opal_common_ucx.tls + (negate ? 1 : 0), ',');
244+
negate = ('^' == (opal_common_ucx.tls)[0]);
245+
tl_list = opal_argv_split(opal_common_ucx.tls + (negate ? 1 : 0), ',');
278246
if (tl_list == NULL) {
279247
MCA_COMMON_UCX_VERBOSE(1, "failed to split tl list '%s', ucx is disabled",
280-
*opal_common_ucx.tls);
248+
opal_common_ucx.tls);
281249
goto out;
282250
}
283251

284252
/* Split devices list */
285-
device_list = opal_argv_split(*opal_common_ucx.devices, ',');
253+
device_list = opal_argv_split(opal_common_ucx.devices, ',');
286254
if (device_list == NULL) {
287255
MCA_COMMON_UCX_VERBOSE(1, "failed to split devices list '%s', ucx is disabled",
288-
*opal_common_ucx.devices);
256+
opal_common_ucx.devices);
289257
goto out_free_tl_list;
290258
}
291259

opal/mca/common/ucx/common_ucx.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,8 @@ typedef struct opal_common_ucx_module {
9090
int progress_iterations;
9191
int registered;
9292
bool opal_mem_hooks;
93-
char **tls;
94-
char **devices;
93+
char *tls;
94+
char *devices;
9595
} opal_common_ucx_module_t;
9696

9797
typedef struct opal_common_ucx_del_proc {

0 commit comments

Comments
 (0)