Skip to content

common/ucx: fix variable registration memory leak and cleanup #10105

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 60 additions & 86 deletions opal/mca/common/ucx/common_ucx.c
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (C) Mellanox Technologies Ltd. 2018. ALL RIGHTS RESERVED.
* Copyright (c) 2019 Intel, Inc. All rights reserved.
* Copyright (c) 2019 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2021 Triad National Security, LLC. All rights
* reserved.
* Copyright (c) 2022 Google, LLC. All rights reserved.
*
* $COPYRIGHT$
*
Expand Down Expand Up @@ -33,11 +35,11 @@ static int use_safety_valve = 0;

extern mca_base_framework_t opal_memory_base_framework;

opal_common_ucx_module_t opal_common_ucx = {.verbose = 0,
.progress_iterations = 100,
.registered = 0,
.opal_mem_hooks = 1,
.tls = NULL};
opal_common_ucx_module_t opal_common_ucx =
{
.progress_iterations = 100,
.opal_mem_hooks = 1,
};

static opal_mutex_t opal_common_ucx_mutex = OPAL_MUTEX_STATIC_INIT;

Expand All @@ -48,87 +50,59 @@ static void opal_common_ucx_mem_release_cb(void *buf, size_t length, void *cbdat

OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t *component)
{
static const char *default_tls = "rc_verbs,ud_verbs,rc_mlx5,dc_mlx5,ud_mlx5,cuda_ipc,rocm_ipc";
static const char *default_devices = "mlx*";
static int hook_index;
static int verbose_index;
static int progress_index;
static int tls_index;
static int devices_index;
int param;
char *default_tls = "rc_verbs,ud_verbs,rc_mlx5,dc_mlx5,ud_mlx5,cuda_ipc,rocm_ipc";
char *default_devices = "mlx*";
int hook_index;
int verbose_index;
int progress_index;
int tls_index;
int devices_index;

OPAL_THREAD_LOCK(&opal_common_ucx_mutex);

param = mca_base_var_find("opal", "opal_common", "ucx", "verbose");
if (0 > param) {
verbose_index = mca_base_var_register("opal", "opal_common", "ucx", "verbose",
"Verbose level of the UCX components",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_LOCAL, &opal_common_ucx.verbose);
}

param = mca_base_var_find("opal", "opal_common", "ucx", "progress_iterations");
if (0 > param) {
progress_index = mca_base_var_register("opal", "opal_common", "ucx", "progress_iterations",
"Set number of calls of internal UCX progress "
"calls per opal_progress call",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_LOCAL,
&opal_common_ucx.progress_iterations);
}

param = mca_base_var_find("opal", "opal_common", "ucx", "opal_mem_hooks");
if (0 > param) {
hook_index = mca_base_var_register("opal", "opal_common", "ucx", "opal_mem_hooks",
"Use OPAL memory hooks, instead of UCX internal "
"memory hooks",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_3,
/* It is harmless to re-register variables so go ahead an re-register. */
verbose_index = mca_base_var_register("opal", "opal_common", "ucx", "verbose",
"Verbose level of the UCX components",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_LOCAL, &opal_common_ucx.verbose);
progress_index = mca_base_var_register("opal", "opal_common", "ucx", "progress_iterations",
"Set number of calls of internal UCX progress "
"calls per opal_progress call",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_LOCAL,
&opal_common_ucx.opal_mem_hooks);
&opal_common_ucx.progress_iterations);
hook_index = mca_base_var_register("opal", "opal_common", "ucx", "opal_mem_hooks",
"Use OPAL memory hooks, instead of UCX internal "
"memory hooks",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_LOCAL,
&opal_common_ucx.opal_mem_hooks);

if (NULL == opal_common_ucx.tls) {
opal_common_ucx.tls = default_tls;
}

param = mca_base_var_find("opal", "opal_common", "ucx", "tls");
if (0 > param) {

/*
* this monkey business is needed because of the way the MCA VARs framework tries to handle pointers to strings
* when destructing the MCA var database. If you don't do something like this,the MCA var framework will try
* to dereference a pointer which itself is no longer a valid address owing to having been previously dlclosed.
* Same for the devices pointer below.
*/
if (NULL == opal_common_ucx.tls) {
opal_common_ucx.tls = malloc(sizeof(*opal_common_ucx.tls));
assert(NULL != opal_common_ucx.tls);
}
*opal_common_ucx.tls = strdup(default_tls);
tls_index = mca_base_var_register(
"opal", "opal_common", "ucx", "tls",
"List of UCX transports which should be supported on the system, to enable "
"selecting the UCX component. Special values: any (any available). "
"A '^' prefix negates the list. "
"For example, in order to exclude on shared memory and TCP transports, "
"please set to '^posix,sysv,self,tcp,cma,knem,xpmem'.",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
opal_common_ucx.tls);
}

param = mca_base_var_find("opal", "opal_common", "ucx", "devices");
if (0 > param) {

if (NULL == opal_common_ucx.devices) {
opal_common_ucx.devices = malloc(sizeof(*opal_common_ucx.devices));
assert(NULL != opal_common_ucx.devices);
}
*opal_common_ucx.devices = strdup(default_devices);
devices_index = mca_base_var_register(
"opal", "opal_common", "ucx", "devices",
"List of device driver pattern names, which, if supported by UCX, will "
"bump its priority above ob1. Special values: any (any available)",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
opal_common_ucx.devices);
tls_index = mca_base_var_register(
"opal", "opal_common", "ucx", "tls",
"List of UCX transports which should be supported on the system, to enable "
"selecting the UCX component. Special values: any (any available). "
"A '^' prefix negates the list. "
"For example, in order to exclude on shared memory and TCP transports, "
"please set to '^posix,sysv,self,tcp,cma,knem,xpmem'.",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
&opal_common_ucx.tls);

if (NULL == opal_common_ucx.devices) {
opal_common_ucx.devices = default_devices;
}
devices_index = mca_base_var_register(
"opal", "opal_common", "ucx", "devices",
"List of device driver pattern names, which, if supported by UCX, will "
"bump its priority above ob1. Special values: any (any available)",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
&opal_common_ucx.devices);

if (component) {
mca_base_var_register_synonym(verbose_index, component->mca_project_name,
Expand Down Expand Up @@ -261,8 +235,8 @@ OPAL_DECLSPEC opal_common_ucx_support_level_t opal_common_ucx_support_level(ucp_
int ret;
#endif

is_any_tl = !strcmp(*opal_common_ucx.tls, "any");
is_any_device = !strcmp(*opal_common_ucx.devices, "any");
is_any_tl = !strcmp(opal_common_ucx.tls, "any");
is_any_device = !strcmp(opal_common_ucx.devices, "any");

/* Check for special value "any" */
if (is_any_tl && is_any_device) {
Expand All @@ -273,19 +247,19 @@ OPAL_DECLSPEC opal_common_ucx_support_level_t opal_common_ucx_support_level(ucp_

#if HAVE_DECL_OPEN_MEMSTREAM
/* Split transports list */
negate = ('^' == (*opal_common_ucx.tls)[0]);
tl_list = opal_argv_split(*opal_common_ucx.tls + (negate ? 1 : 0), ',');
negate = ('^' == (opal_common_ucx.tls)[0]);
tl_list = opal_argv_split(opal_common_ucx.tls + (negate ? 1 : 0), ',');
if (tl_list == NULL) {
MCA_COMMON_UCX_VERBOSE(1, "failed to split tl list '%s', ucx is disabled",
*opal_common_ucx.tls);
opal_common_ucx.tls);
goto out;
}

/* Split devices list */
device_list = opal_argv_split(*opal_common_ucx.devices, ',');
device_list = opal_argv_split(opal_common_ucx.devices, ',');
if (device_list == NULL) {
MCA_COMMON_UCX_VERBOSE(1, "failed to split devices list '%s', ucx is disabled",
*opal_common_ucx.devices);
opal_common_ucx.devices);
goto out_free_tl_list;
}

Expand Down
4 changes: 2 additions & 2 deletions opal/mca/common/ucx/common_ucx.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,8 @@ typedef struct opal_common_ucx_module {
int progress_iterations;
int registered;
bool opal_mem_hooks;
char **tls;
char **devices;
char *tls;
char *devices;
} opal_common_ucx_module_t;

typedef struct opal_common_ucx_del_proc {
Expand Down