Skip to content

Commit 2f4ba1b

Browse files
committed
ofi/common: fix code that broke sessions
With the sessions initialization model (section 11.3 of the MPI 4 standard), MPI may be initialized and finalized any number of times. This patch refactors code that was assuming a one-shot init/finalize sequence for initializing Open MPI and its MCA param space. The underlying problem with the replaced code was that when an app calls MPI_Session_finalize and there are no more sessions active, the MCA param space is destroyed. So if one does not build Open MPI to use dynamically loaded components, and a component is using static variables in a way that assumes the MCA param space is always preserved if a static variable is set to some value, then things break if a subsequent MPI_Session_init is invoked. Related to #12869 Signed-off-by: Howard Pritchard <[email protected]>
1 parent 86961a2 commit 2f4ba1b

File tree

1 file changed

+22
-8
lines changed

1 file changed

+22
-8
lines changed

opal/mca/common/ofi/common_ofi.c

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -324,10 +324,11 @@ int opal_common_ofi_providers_subset_of_list(struct fi_info *provider_list, char
324324

325325
int opal_common_ofi_mca_register(const mca_base_component_t *component)
326326
{
327-
static int include_index = -1;
328-
static int exclude_index = -1;
329-
static int verbose_index = -1;
330-
static int accelerator_rank_index = -1;
327+
int include_index;
328+
int exclude_index;
329+
int verbose_index;
330+
int accelerator_rank_index;
331+
int param;
331332
int ret;
332333

333334
if (fi_version() < FI_VERSION(1, 0)) {
@@ -336,7 +337,8 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component)
336337

337338
OPAL_THREAD_LOCK(&opal_common_ofi_mutex);
338339

339-
if (0 > include_index) {
340+
param = mca_base_var_find("opal", "opal_common", "ofi", "provider_include");
341+
if (0 > param) {
340342
/*
341343
* this monkey business is needed because of the way the MCA VARs stuff tries to handle
342344
* pointers to strings when destructing the MCA var database. If you don't do
@@ -359,9 +361,12 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component)
359361
ret = include_index;
360362
goto err;
361363
}
364+
} else {
365+
include_index = param;
362366
}
363367

364-
if (0 > exclude_index) {
368+
param = mca_base_var_find("opal", "opal_common", "ofi", "provider_exclude");
369+
if (0 > param) {
365370
if (NULL == opal_common_ofi.prov_exclude) {
366371
opal_common_ofi.prov_exclude = (char **) malloc(sizeof(char *));
367372
assert(NULL != opal_common_ofi.prov_exclude);
@@ -378,9 +383,12 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component)
378383
ret = exclude_index;
379384
goto err;
380385
}
386+
} else {
387+
exclude_index = param;
381388
}
382389

383-
if (0 > verbose_index) {
390+
param = mca_base_var_find("opal", "opal_common", "ofi", "verbose");
391+
if (0 > param) {
384392
verbose_index = mca_base_var_register("opal", "opal_common", "ofi", "verbose",
385393
"Verbose level of the OFI components",
386394
MCA_BASE_VAR_TYPE_INT, NULL, 0,
@@ -391,9 +399,13 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component)
391399
ret = verbose_index;
392400
goto err;
393401
}
402+
} else {
403+
verbose_index = param;
394404
}
395405

396-
if (0 > accelerator_rank_index) {
406+
407+
param = mca_base_var_find("opal", "opal_common", "ofi", "accelerator_rank");
408+
if (0 > param) {
397409
accelerator_rank_index
398410
= mca_base_var_register("opal", "opal_common", "ofi", "accelerator_rank",
399411
"Process rank(non-negative) on the selected accelerator device",
@@ -404,6 +416,8 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component)
404416
ret = accelerator_rank_index;
405417
goto err;
406418
}
419+
} else {
420+
accelerator_rank_index = param;
407421
}
408422

409423
if (component) {

0 commit comments

Comments
 (0)