Skip to content

Commit 456ac7f

Browse files
author
Ralph Castain
committed
Modify the internal logic for resolve nodes/peers
The current code path for PMIx_Resolve_peers and PMIx_Resolve_nodes executes a threadshift in the preg components themselves. This is done to ensure thread safety when called from the user level. However, it causes thread-stall when someone attempts to call the regex functions from _inside_ the PMIx code base should the call occur from within an event. Accordingly, move the threadshift to the client-level functions and make the preg components just execute their algorithms. Create a new pnet/test component to verify that the prge code can be safely accessed - set that component to be selected only when the user directly specifies it. The new component will be used to validate various logical extensions during development, and can then be discarded. Signed-off-by: Ralph Castain <[email protected]>
1 parent 64c9e23 commit 456ac7f

File tree

10 files changed

+643
-190
lines changed

10 files changed

+643
-190
lines changed

opal/mca/pmix/pmix3x/pmix/src/client/pmix_client.c

Lines changed: 105 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1175,37 +1175,135 @@ static void _commitfn(int sd, short args, void *cbdata)
11751175
return rc;
11761176
}
11771177

1178+
static void _resolve_peers(int sd, short args, void *cbdata)
1179+
{
1180+
pmix_cb_t *cb = (pmix_cb_t*)cbdata;
1181+
pmix_status_t rc;
1182+
1183+
cb->status = pmix_preg.resolve_peers(cb->key, cb->pname.nspace,
1184+
&cb->procs, &cb->nprocs);
1185+
/* post the data so the receiving thread can acquire it */
1186+
PMIX_POST_OBJECT(cb);
1187+
PMIX_WAKEUP_THREAD(&cb->lock);
1188+
}
1189+
11781190
/* need to thread-shift this request */
11791191
PMIX_EXPORT pmix_status_t PMIx_Resolve_peers(const char *nodename,
11801192
const char *nspace,
11811193
pmix_proc_t **procs, size_t *nprocs)
11821194
{
1195+
pmix_cb_t *cb;
1196+
pmix_status_t rc;
1197+
pmix_proc_t proc;
1198+
11831199
PMIX_ACQUIRE_THREAD(&pmix_global_lock);
11841200
if (pmix_globals.init_cntr <= 0) {
11851201
PMIX_RELEASE_THREAD(&pmix_global_lock);
11861202
return PMIX_ERR_INIT;
11871203
}
11881204
PMIX_RELEASE_THREAD(&pmix_global_lock);
11891205

1190-
/* set default */
1191-
*procs = NULL;
1192-
*nprocs = 0;
11931206

1194-
return pmix_preg.resolve_peers(nodename, nspace, procs, nprocs);
1207+
cb = PMIX_NEW(pmix_cb_t);
1208+
cb->key = (char*)nodename;
1209+
cb->pname.nspace = strdup(nspace);
1210+
1211+
PMIX_THREADSHIFT(cb, _resolve_peers);
1212+
1213+
/* wait for the result */
1214+
PMIX_WAIT_THREAD(&cb->lock);
1215+
1216+
/* if the nspace wasn't found, then we need to
1217+
* ask the server for that info */
1218+
if (PMIX_ERR_INVALID_NAMESPACE == cb->status) {
1219+
(void)strncpy(proc.nspace, nspace, PMIX_MAX_NSLEN);
1220+
proc.rank = PMIX_RANK_WILDCARD;
1221+
/* any key will suffice as it will bring down
1222+
* the entire data blob */
1223+
rc = PMIx_Get(&proc, PMIX_UNIV_SIZE, NULL, 0, NULL);
1224+
if (PMIX_SUCCESS != rc) {
1225+
PMIX_RELEASE(cb);
1226+
return rc;
1227+
}
1228+
/* retry the fetch */
1229+
cb->lock.active = true;
1230+
PMIX_THREADSHIFT(cb, _resolve_peers);
1231+
PMIX_WAIT_THREAD(&cb->lock);
1232+
}
1233+
*procs = cb->procs;
1234+
*nprocs = cb->nprocs;
1235+
1236+
rc = cb->status;
1237+
PMIX_RELEASE(cb);
1238+
return rc;
1239+
}
1240+
1241+
static void _resolve_nodes(int fd, short args, void *cbdata)
1242+
{
1243+
pmix_cb_t *cb = (pmix_cb_t*)cbdata;
1244+
char *regex, **names;
1245+
1246+
/* get a regular expression describing the PMIX_NODE_MAP */
1247+
cb->status = pmix_preg.resolve_nodes(cb->pname.nspace, &regex);
1248+
if (PMIX_SUCCESS == cb->status) {
1249+
/* parse it into an argv array of names */
1250+
cb->status = pmix_preg.parse_nodes(regex, &names);
1251+
if (PMIX_SUCCESS == cb->status) {
1252+
/* assemble it into a comma-delimited list */
1253+
cb->key = pmix_argv_join(names, ',');
1254+
pmix_argv_free(names);
1255+
} else {
1256+
free(regex);
1257+
}
1258+
}
1259+
/* post the data so the receiving thread can acquire it */
1260+
PMIX_POST_OBJECT(cb);
1261+
PMIX_WAKEUP_THREAD(&cb->lock);
11951262
}
11961263

11971264
/* need to thread-shift this request */
11981265
PMIX_EXPORT pmix_status_t PMIx_Resolve_nodes(const char *nspace, char **nodelist)
11991266
{
1267+
pmix_cb_t *cb;
1268+
pmix_status_t rc;
1269+
pmix_proc_t proc;
1270+
12001271
PMIX_ACQUIRE_THREAD(&pmix_global_lock);
12011272
if (pmix_globals.init_cntr <= 0) {
12021273
PMIX_RELEASE_THREAD(&pmix_global_lock);
12031274
return PMIX_ERR_INIT;
12041275
}
12051276
PMIX_RELEASE_THREAD(&pmix_global_lock);
12061277

1207-
/* set default */
1208-
*nodelist = NULL;
1278+
cb = PMIX_NEW(pmix_cb_t);
1279+
cb->pname.nspace = strdup(nspace);
1280+
1281+
PMIX_THREADSHIFT(cb, _resolve_nodes);
1282+
1283+
/* wait for the result */
1284+
PMIX_WAIT_THREAD(&cb->lock);
1285+
1286+
/* if the nspace wasn't found, then we need to
1287+
* ask the server for that info */
1288+
if (PMIX_ERR_INVALID_NAMESPACE == cb->status) {
1289+
(void)strncpy(proc.nspace, nspace, PMIX_MAX_NSLEN);
1290+
proc.rank = PMIX_RANK_WILDCARD;
1291+
/* any key will suffice as it will bring down
1292+
* the entire data blob */
1293+
rc = PMIx_Get(&proc, PMIX_UNIV_SIZE, NULL, 0, NULL);
1294+
if (PMIX_SUCCESS != rc) {
1295+
PMIX_RELEASE(cb);
1296+
return rc;
1297+
}
1298+
/* retry the fetch */
1299+
cb->lock.active = true;
1300+
PMIX_THREADSHIFT(cb, _resolve_nodes);
1301+
PMIX_WAIT_THREAD(&cb->lock);
1302+
}
1303+
/* the string we want is in the key field */
1304+
*nodelist = cb->key;
12091305

1210-
return pmix_preg.resolve_nodes(nspace, nodelist);
1306+
rc = cb->status;
1307+
PMIX_RELEASE(cb);
1308+
return rc;
12111309
}

opal/mca/pmix/pmix3x/pmix/src/mca/gds/hash/gds_hash.c

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -333,7 +333,8 @@ static pmix_status_t store_map(pmix_hash_table_t *ht,
333333
}
334334

335335
/* store the comma-delimited list of nodes hosting
336-
* procs in this nspace */
336+
* procs in this nspace in case someone using PMIx v2
337+
* requests it */
337338
kp2 = PMIX_NEW(pmix_kval_t);
338339
kp2->key = strdup(PMIX_NODE_LIST);
339340
kp2->value = (pmix_value_t*)malloc(sizeof(pmix_value_t));
@@ -397,6 +398,19 @@ pmix_status_t hash_cache_job_info(struct pmix_nspace_t *ns,
397398
ht = &trk->internal;
398399
for (n=0; n < ninfo; n++) {
399400
if (0 == strcmp(info[n].key, PMIX_NODE_MAP)) {
401+
/* store the node map itself since that is
402+
* what v3 uses */
403+
kp2 = PMIX_NEW(pmix_kval_t);
404+
kp2->key = strdup(PMIX_NODE_MAP);
405+
kp2->value = (pmix_value_t*)malloc(sizeof(pmix_value_t));
406+
kp2->value->type = PMIX_STRING;
407+
kp2->value->data.string = strdup(info[n].value.data.string);
408+
if (PMIX_SUCCESS != (rc = pmix_hash_store(ht, PMIX_RANK_WILDCARD, kp2))) {
409+
PMIX_ERROR_LOG(rc);
410+
PMIX_RELEASE(kp2);
411+
return rc;
412+
}
413+
400414
/* parse the regex to get the argv array of node names */
401415
if (PMIX_SUCCESS != (rc = pmix_preg.parse_nodes(info[n].value.data.string, &nodes))) {
402416
PMIX_ERROR_LOG(rc);

opal/mca/pmix/pmix3x/pmix/src/mca/pnet/opa/pnet_opa.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
#include "src/util/error.h"
3838
#include "src/util/output.h"
3939
#include "src/util/pmix_environ.h"
40+
#include "src/mca/preg/preg.h"
4041

4142
#include "src/mca/pnet/pnet.h"
4243
#include "src/mca/pnet/base/base.h"
@@ -298,6 +299,10 @@ static pmix_status_t setup_local_network(pmix_nspace_t *nptr,
298299
size_t n;
299300
pmix_status_t rc;
300301
pmix_kval_t *kv;
302+
char *nodestring, **nodes;
303+
pmix_proc_t *procs;
304+
size_t nprocs;
305+
301306

302307
if (NULL != info) {
303308
for (n=0; n < ninfo; n++) {
@@ -321,6 +326,7 @@ static pmix_status_t setup_local_network(pmix_nspace_t *nptr,
321326
}
322327
}
323328
}
329+
324330
return PMIX_SUCCESS;
325331
}
326332

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# -*- makefile -*-
2+
#
3+
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
4+
# University Research and Technology
5+
# Corporation. All rights reserved.
6+
# Copyright (c) 2004-2005 The University of Tennessee and The University
7+
# of Tennessee Research Foundation. All rights
8+
# reserved.
9+
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10+
# University of Stuttgart. All rights reserved.
11+
# Copyright (c) 2004-2005 The Regents of the University of California.
12+
# All rights reserved.
13+
# Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
14+
# Copyright (c) 2013-2018 Intel, Inc. All rights reserved.
15+
# Copyright (c) 2017 Research Organization for Information Science
16+
# and Technology (RIST). All rights reserved.
17+
# $COPYRIGHT$
18+
#
19+
# Additional copyrights may follow
20+
#
21+
# $HEADER$
22+
#
23+
24+
headers = pnet_test.h
25+
sources = \
26+
pnet_test_component.c \
27+
pnet_test.c
28+
29+
# Make the output library in this directory, and name it either
30+
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
31+
# (for static builds).
32+
33+
if MCA_BUILD_pmix_pnet_test_DSO
34+
lib =
35+
lib_sources =
36+
component = mca_pnet_test.la
37+
component_sources = $(headers) $(sources)
38+
else
39+
lib = libmca_pnet_test.la
40+
lib_sources = $(headers) $(sources)
41+
component =
42+
component_sources =
43+
endif
44+
45+
mcacomponentdir = $(pmixlibdir)
46+
mcacomponent_LTLIBRARIES = $(component)
47+
mca_pnet_test_la_SOURCES = $(component_sources)
48+
mca_pnet_test_la_LDFLAGS = -module -avoid-version
49+
50+
noinst_LTLIBRARIES = $(lib)
51+
libmca_pnet_test_la_SOURCES = $(lib_sources)
52+
libmca_pnet_test_la_LDFLAGS = -module -avoid-version

0 commit comments

Comments
 (0)