Skip to content

Commit c1bab0d

Browse files
author
Ralph Castain
committed
Implement support for proctable queries
Signed-off-by: Ralph Castain <[email protected]>
1 parent d509e8d commit c1bab0d

File tree

12 files changed

+716
-93
lines changed

12 files changed

+716
-93
lines changed

examples/debugger.c

Lines changed: 62 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ typedef struct {
7474
* info from a query */
7575
typedef struct {
7676
mylock_t lock;
77+
pmix_status_t status;
7778
pmix_info_t *info;
7879
size_t ninfo;
7980
} myquery_data_t;
@@ -105,14 +106,14 @@ static void cbfunc(pmix_status_t status,
105106
myquery_data_t *mq = (myquery_data_t*)cbdata;
106107
size_t n;
107108

109+
mq->status = status;
108110
/* save the returned info - the PMIx library "owns" it
109111
* and will release it and perform other cleanup actions
110112
* when release_fn is called */
111113
if (0 < ninfo) {
112114
PMIX_INFO_CREATE(mq->info, ninfo);
113115
mq->ninfo = ninfo;
114116
for (n=0; n < ninfo; n++) {
115-
fprintf(stderr, "Transferring %s\n", info[n].key);
116117
PMIX_INFO_XFER(&mq->info[n], &info[n]);
117118
}
118119
}
@@ -337,9 +338,11 @@ int main(int argc, char **argv)
337338
/* check to see if we are using an intermediate launcher - we only
338339
* support those we recognize */
339340
found = false;
340-
for (n=0; NULL != launchers[n]; n++) {
341-
if (0 == strcmp(argv[1], launchers[n])) {
342-
found = true;
341+
if (1 < argc) {
342+
for (n=0; NULL != launchers[n]; n++) {
343+
if (0 == strcmp(argv[1], launchers[n])) {
344+
found = true;
345+
}
343346
}
344347
}
345348
if (found) {
@@ -507,6 +510,61 @@ int main(int argc, char **argv)
507510
PMIX_INFO_FREE(info, ninfo);
508511
PMIX_APP_FREE(app, napps);
509512

513+
/* get the proctable for this nspace */
514+
PMIX_QUERY_CREATE(query, 1);
515+
PMIX_ARGV_APPEND(rc, query[0].keys, PMIX_QUERY_PROC_TABLE);
516+
query[0].nqual = 1;
517+
PMIX_INFO_CREATE(query->qualifiers, query[0].nqual);
518+
PMIX_INFO_LOAD(&query->qualifiers[0], PMIX_NSPACE, clientspace, PMIX_STRING);
519+
520+
DEBUG_CONSTRUCT_LOCK(&myquery_data.lock);
521+
myquery_data.info = NULL;
522+
myquery_data.ninfo = 0;
523+
524+
if (PMIX_SUCCESS != (rc = PMIx_Query_info_nb(query, 1, cbfunc, (void*)&myquery_data))) {
525+
fprintf(stderr, "Debugger[%s:%d] Proctable query failed: %d\n", myproc.nspace, myproc.rank, rc);
526+
goto done;
527+
}
528+
/* wait to get a response */
529+
DEBUG_WAIT_THREAD(&myquery_data.lock);
530+
DEBUG_DESTRUCT_LOCK(&myquery_data.lock);
531+
532+
/* we should have gotten a response */
533+
if (PMIX_SUCCESS != myquery_data.status) {
534+
fprintf(stderr, "Debugger[%s:%d] Proctable query failed: %s\n",
535+
myproc.nspace, myproc.rank, PMIx_Error_string(myquery_data.status));
536+
goto done;
537+
}
538+
/* there should have been data */
539+
if (NULL == myquery_data.info || 0 == myquery_data.ninfo) {
540+
fprintf(stderr, "Debugger[%s:%d] Proctable query return no results\n",
541+
myproc.nspace, myproc.rank);
542+
goto done;
543+
}
544+
/* the query should have returned a data_array */
545+
if (PMIX_DATA_ARRAY != myquery_data.info[0].value.type) {
546+
fprintf(stderr, "Debugger[%s:%d] Query returned incorrect data type: %s\n", PMIx_Data_type_string(myquery_data.info[0].value.type));
547+
return -1;
548+
}
549+
if (NULL == myquery_data.info[0].value.data.darray->array) {
550+
fprintf(stderr, "Debugger[%s:%d] Query returned no proctable info\n");
551+
goto done;
552+
}
553+
/* the data array consists of a struct:
554+
* size_t size;
555+
* void* array;
556+
*
557+
* In this case, the array is composed of pmix_proc_info_t structs:
558+
* pmix_proc_t proc; // contains the nspace,rank of this proc
559+
* char* hostname;
560+
* char* executable_name;
561+
* pid_t pid;
562+
* int exit_code;
563+
* pmix_proc_state_t state;
564+
*/
565+
fprintf(stderr, "Received %d array elements\n", (int)myquery_data.info[0].value.data.darray->size);
566+
goto done;
567+
510568
/* now launch the debugger daemons */
511569
if (PMIX_SUCCESS != (rc = spawn_debugger(clientspace))) {
512570
goto done;

opal/class/opal_interval_tree.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727

2828
#include "opal/class/opal_interval_tree.h"
2929
#include <limits.h>
30+
#include <stddef.h>
31+
#include <string.h>
3032

3133
/* Private functions */
3234
static void opal_interval_tree_insert_node (opal_interval_tree_t *tree, opal_interval_tree_node_t *node);

opal/mca/pmix/pmix3x/pmix/include/pmix_common.h.in

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -573,11 +573,13 @@ typedef uint8_t pmix_proc_state_t;
573573
#define PMIX_PROC_STATE_ABORTED_BY_SIG (PMIX_PROC_STATE_ERROR + 4) /* process aborted by signal */
574574
#define PMIX_PROC_STATE_TERM_WO_SYNC (PMIX_PROC_STATE_ERROR + 5) /* process exit'd w/o calling PMIx_Finalize */
575575
#define PMIX_PROC_STATE_COMM_FAILED (PMIX_PROC_STATE_ERROR + 6) /* process communication has failed */
576-
#define PMIX_PROC_STATE_CALLED_ABORT (PMIX_PROC_STATE_ERROR + 7) /* process called "PMIx_Abort" */
577-
#define PMIX_PROC_STATE_MIGRATING (PMIX_PROC_STATE_ERROR + 8) /* process failed and is waiting for resources before restarting */
578-
#define PMIX_PROC_STATE_CANNOT_RESTART (PMIX_PROC_STATE_ERROR + 9) /* process failed and cannot be restarted */
579-
#define PMIX_PROC_STATE_TERM_NON_ZERO (PMIX_PROC_STATE_ERROR + 10) /* process exited with a non-zero status, indicating abnormal */
580-
#define PMIX_PROC_STATE_FAILED_TO_LAUNCH (PMIX_PROC_STATE_ERROR + 11) /* unable to launch process */
576+
#define PMIX_PROC_STATE_SENSOR_BOUND_EXCEEDED (PMIX_PROC_STATE_ERROR + 7) /* process exceeded a sensor limit */
577+
#define PMIX_PROC_STATE_CALLED_ABORT (PMIX_PROC_STATE_ERROR + 8) /* process called "PMIx_Abort" */
578+
#define PMIX_PROC_STATE_HEARTBEAT_FAILED (PMIX_PROC_STATE_ERROR + 9) /* process failed to send heartbeat w/in time limit */
579+
#define PMIX_PROC_STATE_MIGRATING (PMIX_PROC_STATE_ERROR + 10) /* process failed and is waiting for resources before restarting */
580+
#define PMIX_PROC_STATE_CANNOT_RESTART (PMIX_PROC_STATE_ERROR + 11) /* process failed and cannot be restarted */
581+
#define PMIX_PROC_STATE_TERM_NON_ZERO (PMIX_PROC_STATE_ERROR + 12) /* process exited with a non-zero status, indicating abnormal */
582+
#define PMIX_PROC_STATE_FAILED_TO_LAUNCH (PMIX_PROC_STATE_ERROR + 13) /* unable to launch process */
581583

582584

583585
/**** PMIX ERROR CONSTANTS ****/
@@ -1356,16 +1358,20 @@ struct pmix_info_t {
13561358
} \
13571359
} while (0)
13581360

1359-
#define PMIX_INFO_LOAD(m, k, v, t) \
1360-
do { \
1361-
(void)strncpy((m)->key, (k), PMIX_MAX_KEYLEN); \
1362-
pmix_value_load(&((m)->value), (v), (t)); \
1363-
} while (0)
1364-
#define PMIX_INFO_XFER(d, s) \
1361+
#define PMIX_INFO_LOAD(m, k, v, t) \
13651362
do { \
1366-
(void)strncpy((d)->key, (s)->key, PMIX_MAX_KEYLEN); \
1367-
(d)->flags = (s)->flags; \
1368-
pmix_value_xfer(&(d)->value, &(s)->value); \
1363+
if (NULL != (k)) { \
1364+
(void)strncpy((m)->key, (k), PMIX_MAX_KEYLEN); \
1365+
} \
1366+
pmix_value_load(&((m)->value), (v), (t)); \
1367+
} while (0)
1368+
#define PMIX_INFO_XFER(d, s) \
1369+
do { \
1370+
if (NULL != (s)->key) { \
1371+
(void)strncpy((d)->key, (s)->key, PMIX_MAX_KEYLEN); \
1372+
} \
1373+
(d)->flags = (s)->flags; \
1374+
pmix_value_xfer(&(d)->value, &(s)->value); \
13691375
} while(0)
13701376

13711377
#define PMIX_INFO_REQUIRED(m) \
@@ -1386,7 +1392,9 @@ struct pmix_info_t {
13861392
(r) = PMIX_ERR_NOMEM; \
13871393
break; \
13881394
} \
1389-
_kv->key = strdup(_info[_n].key); \
1395+
if (NULL != _info[_n].key) { \
1396+
_kv->key = strdup(_info[_n].key); \
1397+
} \
13901398
PMIX_VALUE_XFER((r), _kv->value, &_info[_n].value);\
13911399
if (PMIX_SUCCESS != (r)) { \
13921400
PMIX_RELEASE(_kv); \

opal/mca/pmix/pmix3x/pmix/src/common/pmix_query.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ static void query_cbfunc(struct pmix_peer_t *peer,
6363
PMIX_BFROPS_UNPACK(rc, peer, buf, &results->status, &cnt, PMIX_STATUS);
6464
if (PMIX_SUCCESS != rc) {
6565
PMIX_ERROR_LOG(rc);
66+
results->status = rc;
6667
goto complete;
6768
}
6869
if (PMIX_SUCCESS != results->status) {
@@ -74,6 +75,7 @@ static void query_cbfunc(struct pmix_peer_t *peer,
7475
PMIX_BFROPS_UNPACK(rc, peer, buf, &results->ninfo, &cnt, PMIX_SIZE);
7576
if (PMIX_SUCCESS != rc) {
7677
PMIX_ERROR_LOG(rc);
78+
results->status = rc;
7779
goto complete;
7880
}
7981
if (0 < results->ninfo) {
@@ -82,6 +84,7 @@ static void query_cbfunc(struct pmix_peer_t *peer,
8284
PMIX_BFROPS_UNPACK(rc, peer, buf, results->info, &cnt, PMIX_INFO);
8385
if (PMIX_SUCCESS != rc) {
8486
PMIX_ERROR_LOG(rc);
87+
results->status = rc;
8588
goto complete;
8689
}
8790
}

opal/mca/pmix/pmix3x/pmix/src/common/pmix_strings.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,12 @@ PMIX_EXPORT const char* PMIx_Proc_state_string(pmix_proc_state_t state)
7171
return "PROC TERMINATED WITHOUT CALLING PMIx_Finalize";
7272
case PMIX_PROC_STATE_COMM_FAILED:
7373
return "PROC LOST COMMUNICATION";
74+
case PMIX_PROC_STATE_SENSOR_BOUND_EXCEEDED:
75+
return "PROC SENSOR BOUND EXCEEDED";
7476
case PMIX_PROC_STATE_CALLED_ABORT:
7577
return "PROC CALLED PMIx_Abort";
78+
case PMIX_PROC_STATE_HEARTBEAT_FAILED:
79+
return "PROC FAILED TO REPORT HEARTBEAT";
7680
case PMIX_PROC_STATE_MIGRATING:
7781
return "PROC WAITING TO MIGRATE";
7882
case PMIX_PROC_STATE_CANNOT_RESTART:

opal/mca/pmix/pmix3x/pmix/src/mca/bfrops/base/bfrop_base_copy.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,7 @@ pmix_status_t pmix_bfrops_base_copy_pinfo(pmix_proc_info_t **dest,
373373
if (NULL == p) {
374374
return PMIX_ERR_NOMEM;
375375
}
376+
memcpy(&p->proc, &src->proc, sizeof(pmix_proc_t));
376377
if (NULL != src->hostname) {
377378
p->hostname = strdup(src->hostname);
378379
}
@@ -623,7 +624,7 @@ pmix_status_t pmix_bfrops_base_copy_darray(pmix_data_array_t **dest,
623624
p1 = (pmix_info_t*)p->array;
624625
s1 = (pmix_info_t*)src->array;
625626
for (n=0; n < src->size; n++) {
626-
PMIX_INFO_LOAD(&p1[n], s1[n].key, &s1[n].value.data.flag, s1[n].value.type);
627+
PMIX_INFO_XFER(&p1[n], &s1[n]);
627628
}
628629
break;
629630
case PMIX_PDATA:
@@ -635,7 +636,7 @@ pmix_status_t pmix_bfrops_base_copy_darray(pmix_data_array_t **dest,
635636
pd = (pmix_pdata_t*)p->array;
636637
sd = (pmix_pdata_t*)src->array;
637638
for (n=0; n < src->size; n++) {
638-
PMIX_PDATA_LOAD(&pd[n], &sd[n].proc, sd[n].key, &sd[n].value.data.flag, sd[n].value.type);
639+
PMIX_PDATA_XFER(&pd[n], &sd[n]);
639640
}
640641
break;
641642
case PMIX_BUFFER:

0 commit comments

Comments
 (0)