Skip to content

Commit e9f378e

Browse files
authored
Merge pull request #5500 from tkordenbrock/topic/master/fix.PtlMEUnlink.in.use
coll-portals4: retry PtlMEUnlink() if PTL_IN_USE
2 parents c294bbc + f3f2a82 commit e9f378e

6 files changed

62 additions and 27 deletions (lines changed)

ompi/mca/coll/portals4/coll_portals4_allreduce.c

Lines changed: 26 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -343,15 +343,38 @@ allreduce_kary_tree_top(const void *sendbuf, void *recvbuf, int count,
343343
static int
344344
allreduce_kary_tree_bottom(ompi_coll_portals4_request_t *request)
345345
{
346+
int ret;
347+
346348
if (request->u.allreduce.is_optim) {
347349
PtlAtomicSync();
348350

349351
if (request->u.allreduce.child_nb) {
350-
PtlCTFree(request->u.allreduce.ack_ct_h);
352+
ret = PtlCTFree(request->u.allreduce.ack_ct_h);
353+
if (PTL_OK != ret) {
354+
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
355+
"%s:%d: PtlCTFree failed: %d\n",
356+
__FILE__, __LINE__, ret);
357+
return OMPI_ERROR;
358+
}
351359
}
352360

353-
PtlMEUnlink(request->u.allreduce.data_me_h);
354-
PtlCTFree(request->u.allreduce.trig_ct_h);
361+
do {
362+
ret = PtlMEUnlink(request->u.allreduce.data_me_h);
363+
} while (PTL_IN_USE == ret);
364+
if (PTL_OK != ret) {
365+
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
366+
"%s:%d: PtlMEUnlink failed: %d\n",
367+
__FILE__, __LINE__, ret);
368+
return OMPI_ERROR;
369+
}
370+
371+
ret = PtlCTFree(request->u.allreduce.trig_ct_h);
372+
if (PTL_OK != ret) {
373+
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
374+
"%s:%d: PtlCTFree failed: %d\n",
375+
__FILE__, __LINE__, ret);
376+
return OMPI_ERROR;
377+
}
355378
}
356379

357380
return (OMPI_SUCCESS);

ompi/mca/coll/portals4/coll_portals4_barrier.c

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -206,7 +206,9 @@ barrier_hypercube_bottom(ompi_coll_portals4_request_t *request)
206206
int ret;
207207

208208
/* cleanup */
209-
ret = PtlMEUnlink(request->u.barrier.data_me_h);
209+
do {
210+
ret = PtlMEUnlink(request->u.barrier.data_me_h);
211+
} while (PTL_IN_USE == ret);
210212
if (PTL_OK != ret) {
211213
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
212214
"%s:%d: PtlMEUnlink failed: %d\n",

ompi/mca/coll/portals4/coll_portals4_component.c

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -285,15 +285,19 @@ portals4_close(void)
285285
mca_coll_portals4_component.data_md_h = PTL_INVALID_HANDLE;
286286

287287
if (!PtlHandleIsEqual(mca_coll_portals4_component.finish_me_h, PTL_INVALID_HANDLE)) {
288-
ret = PtlMEUnlink(mca_coll_portals4_component.finish_me_h);
288+
do {
289+
ret = PtlMEUnlink(mca_coll_portals4_component.finish_me_h);
290+
} while (PTL_IN_USE == ret);
289291
if (PTL_OK != ret) {
290292
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
291293
"%s:%d: PtlMEUnlink failed: %d\n",
292294
__FILE__, __LINE__, ret);
293295
}
294296
}
295297
if (!PtlHandleIsEqual(mca_coll_portals4_component.unex_me_h, PTL_INVALID_HANDLE)) {
296-
ret = PtlMEUnlink(mca_coll_portals4_component.unex_me_h);
298+
do {
299+
ret = PtlMEUnlink(mca_coll_portals4_component.unex_me_h);
300+
} while (PTL_IN_USE == ret);
297301
if (PTL_OK != ret) {
298302
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
299303
"%s:%d: PtlMEUnlink failed: %d\n",

ompi/mca/coll/portals4/coll_portals4_gather.c

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -460,7 +460,9 @@ cleanup_gather_handles(ompi_coll_portals4_request_t *request)
460460
/**********************************/
461461
/* Cleanup Gather Handles */
462462
/**********************************/
463-
ret = PtlMEUnlink(request->u.gather.gather_meh);
463+
do {
464+
ret = PtlMEUnlink(request->u.gather.gather_meh);
465+
} while (PTL_IN_USE == ret);
464466
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
465467

466468
ret = PtlCTFree(request->u.gather.gather_cth);
@@ -484,7 +486,9 @@ cleanup_sync_handles(ompi_coll_portals4_request_t *request)
484486
/**********************************/
485487
/* Cleanup Sync Handles */
486488
/**********************************/
487-
ret = PtlMEUnlink(request->u.gather.sync_meh);
489+
do {
490+
ret = PtlMEUnlink(request->u.gather.sync_meh);
491+
} while (PTL_IN_USE == ret);
488492
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
489493

490494
ret = PtlCTFree(request->u.gather.sync_cth);

ompi/mca/coll/portals4/coll_portals4_reduce.c

Lines changed: 17 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -340,24 +340,38 @@ reduce_kary_tree_top(const void *sendbuf, void *recvbuf, int count,
340340
static int
341341
reduce_kary_tree_bottom(ompi_coll_portals4_request_t *request)
342342
{
343+
int ret, line;
344+
343345
if (request->u.reduce.is_optim) {
344346
PtlAtomicSync();
345347

346348
if (request->u.reduce.use_ack_ct_h) {
347-
PtlCTFree(request->u.reduce.ack_ct_h);
349+
ret = PtlCTFree(request->u.reduce.ack_ct_h);
350+
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
348351
}
349352

350353
if (request->u.reduce.child_nb) {
351-
PtlMEUnlink(request->u.reduce.data_me_h);
354+
do {
355+
ret = PtlMEUnlink(request->u.reduce.data_me_h);
356+
} while (PTL_IN_USE == ret);
357+
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
352358
}
353359

354-
PtlCTFree(request->u.reduce.trig_ct_h);
360+
ret = PtlCTFree(request->u.reduce.trig_ct_h);
361+
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
355362

356363
if (request->u.reduce.free_buffer) {
357364
free(request->u.reduce.free_buffer);
358365
}
359366
}
360367
return (OMPI_SUCCESS);
368+
369+
err_hdlr:
370+
opal_output(ompi_coll_base_framework.framework_output,
371+
"%s:%4d:%4d\tError occurred ret=%d",
372+
__FILE__, __LINE__, line, ret);
373+
374+
return ret;
361375
}
362376

363377

ompi/mca/coll/portals4/coll_portals4_scatter.c

Lines changed: 4 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -253,14 +253,8 @@ cleanup_scatter_handles(ompi_coll_portals4_request_t *request)
253253
/**********************************/
254254
do {
255255
ret = PtlMEUnlink(request->u.scatter.scatter_meh);
256-
if (PTL_IN_USE == ret) {
257-
opal_output(ompi_coll_base_framework.framework_output,
258-
"%s:%4d: scatter_meh still in use (ret=%d, rank %2d)",
259-
__FILE__, __LINE__, ret, request->u.scatter.my_rank);
260-
continue;
261-
}
262-
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
263-
} while (ret == PTL_IN_USE);
256+
} while (PTL_IN_USE == ret);
257+
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
264258

265259
ret = PtlCTFree(request->u.scatter.scatter_cth);
266260
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
@@ -292,14 +286,8 @@ cleanup_sync_handles(ompi_coll_portals4_request_t *request)
292286
/**********************************/
293287
do {
294288
ret = PtlMEUnlink(request->u.scatter.sync_meh);
295-
if (PTL_IN_USE == ret) {
296-
opal_output(ompi_coll_base_framework.framework_output,
297-
"%s:%4d: sync_meh still in use (ret=%d, rank %2d)",
298-
__FILE__, __LINE__, ret, request->u.scatter.my_rank);
299-
continue;
300-
}
301-
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
302-
} while (ret == PTL_IN_USE);
289+
} while (PTL_IN_USE == ret);
290+
if (PTL_OK != ret) { ptl_ret = ret; ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
303291

304292
ret = PtlCTFree(request->u.scatter.sync_cth);
305293
if (PTL_OK != ret) { ptl_ret = ret; ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }

0 commit comments

Comments
 (0)