Skip to content

Commit fd6fe3d

Browse files
authored
Merge pull request #12175 from wenduwan/mt_completion_callback
mtl/ofi: avoid accessing request object after completion callback (restart ci)
2 parents d261a4a + 6d79aae commit fd6fe3d

File tree

1 file changed

+15
-9
lines changed

1 file changed

+15
-9
lines changed

ompi/mca/mtl/ofi/mtl_ofi.h

Lines changed: 15 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -131,7 +131,7 @@ opal_mutex_atomic_unlock(&ompi_mtl_ofi.ofi_ctxt[ctxt_id].context_lock)
131131
__opal_attribute_always_inline__ static inline int
132132
ompi_mtl_ofi_context_progress(int ctxt_id)
133133
{
134-
int count = 0, i, events_read;
134+
int count = 0, i, events_read, req_type = -1;
135135
ompi_mtl_ofi_request_t *ofi_req = NULL;
136136
struct fi_cq_err_entry error = { 0 };
137137
ssize_t ret;
@@ -151,12 +151,13 @@ ompi_mtl_ofi_context_progress(int ctxt_id)
151151
if (NULL != ompi_mtl_ofi_wc[i].op_context) {
152152
ofi_req = TO_OFI_REQ(ompi_mtl_ofi_wc[i].op_context);
153153
assert(ofi_req);
154+
req_type = ofi_req->type;
154155
ret = ofi_req->event_callback(&ompi_mtl_ofi_wc[i], ofi_req);
155156
if (OMPI_SUCCESS != ret) {
156157
opal_output(0,
157158
"%s:%d: Error returned by request (type: %d) event callback: %zd.\n"
158159
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
159-
__FILE__, __LINE__, ofi_req->type, ret);
160+
__FILE__, __LINE__, req_type, ret);
160161
fflush(stderr);
161162
exit(1);
162163
}
@@ -192,11 +193,13 @@ ompi_mtl_ofi_context_progress(int ctxt_id)
192193
assert(error.op_context);
193194
ofi_req = TO_OFI_REQ(error.op_context);
194195
assert(ofi_req);
196+
req_type = ofi_req->type;
195197
ret = ofi_req->error_callback(&error, ofi_req);
196198
if (OMPI_SUCCESS != ret) {
197-
opal_output(0, "%s:%d: Error returned by request error callback: %zd.\n"
198-
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
199-
__FILE__, __LINE__, ret);
199+
opal_output(0,
200+
"%s:%d: Error returned by request (type: %d) error callback: %zd.\n"
201+
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
202+
__FILE__, __LINE__, req_type, ret);
200203
fflush(stderr);
201204
exit(1);
202205
}
@@ -1260,7 +1263,7 @@ __opal_attribute_always_inline__ static inline int
12601263
ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
12611264
ompi_mtl_ofi_request_t *ofi_req)
12621265
{
1263-
int ompi_ret;
1266+
int ompi_ret = OMPI_SUCCESS;
12641267
int src = mtl_ofi_get_source(wc);
12651268
ompi_status_public_t *status = NULL;
12661269

@@ -1320,9 +1323,11 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
13201323
}
13211324
}
13221325

1326+
ompi_ret = status->MPI_ERROR;
1327+
13231328
ofi_req->super.completion_callback(&ofi_req->super);
13241329

1325-
return status->MPI_ERROR;
1330+
return ompi_ret;
13261331
}
13271332

13281333
/**
@@ -1462,13 +1467,13 @@ __opal_attribute_always_inline__ static inline int
14621467
ompi_mtl_ofi_mrecv_callback(struct fi_cq_tagged_entry *wc,
14631468
ompi_mtl_ofi_request_t *ofi_req)
14641469
{
1470+
int ompi_ret = OMPI_SUCCESS;
14651471
struct mca_mtl_request_t *mrecv_req = ofi_req->mrecv_req;
14661472
ompi_status_public_t *status = &mrecv_req->ompi_req->req_status;
14671473
status->MPI_SOURCE = mtl_ofi_get_source(wc);
14681474
status->MPI_TAG = MTL_OFI_GET_TAG(wc->tag);
14691475
status->MPI_ERROR = MPI_SUCCESS;
14701476
status->_ucount = wc->len;
1471-
int ompi_ret;
14721477

14731478
ompi_mtl_ofi_deregister_and_free_buffer(ofi_req);
14741479

@@ -1483,11 +1488,12 @@ ompi_mtl_ofi_mrecv_callback(struct fi_cq_tagged_entry *wc,
14831488
}
14841489
}
14851490

1491+
ompi_ret = status->MPI_ERROR;
14861492
free(ofi_req);
14871493

14881494
mrecv_req->completion_callback(mrecv_req);
14891495

1490-
return status->MPI_ERROR;
1496+
return ompi_ret;
14911497
}
14921498

14931499
/**

0 commit comments

Comments
 (0)