Skip to content

Commit a578e45

Browse files
committed
mpi: retain operation and datatype in non-blocking collectives
The MPI standard states that a user MPI_Op and/or user MPI_Datatype can be freed after a call to a non-blocking collective and before the non-blocking collective completes. Retain user (only) MPI_Op and MPI_Datatype when the non-blocking call is invoked, and set a request callback so they are freed when the MPI_Request completes. Thanks to Thomas Ponweiser for reporting this. Fixes open-mpi#2151 Fixes open-mpi#1304 Signed-off-by: Gilles Gouaillardet <[email protected]>
1 parent c2d35aa commit a578e45

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

48 files changed

+805
-139
lines changed

ompi/mca/coll/base/coll_base_util.c

Lines changed: 164 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@
99
* University of Stuttgart. All rights reserved.
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
12-
* Copyright (c) 2014-2017 Research Organization for Information Science
13-
* and Technology (RIST). All rights reserved.
12+
* Copyright (c) 2014-2019 Research Organization for Information Science
13+
* and Technology (RIST). All rights reserved.
1414
* $COPYRIGHT$
1515
*
1616
* Additional copyrights may follow
@@ -103,3 +103,165 @@ int ompi_rounddown(int num, int factor)
103103
num /= factor;
104104
return num * factor; /* floor(num / factor) * factor */
105105
}
106+
107+
static void release_objs_callback(struct ompi_coll_base_nbc_request_t *request) {
108+
if (NULL != request->data.objs.objs[0]) {
109+
OBJ_RELEASE(request->data.objs.objs[0]);
110+
}
111+
if (NULL != request->data.objs.objs[1]) {
112+
OBJ_RELEASE(request->data.objs.objs[1]);
113+
}
114+
}
115+
116+
static int complete_objs_callback(struct ompi_request_t *req) {
117+
struct ompi_coll_base_nbc_request_t *request = (ompi_coll_base_nbc_request_t *)req;
118+
int rc = OMPI_SUCCESS;
119+
assert (NULL != request);
120+
if (NULL != request->cb.req_complete_cb) {
121+
rc = request->cb.req_complete_cb(request->req_complete_cb_data);
122+
}
123+
release_objs_callback(request);
124+
return rc;
125+
}
126+
127+
static int free_objs_callback(struct ompi_request_t **rptr) {
128+
struct ompi_coll_base_nbc_request_t *request = *(ompi_coll_base_nbc_request_t **)rptr;
129+
int rc = OMPI_SUCCESS;
130+
if (NULL != request->cb.req_free) {
131+
rc = request->cb.req_free(rptr);
132+
}
133+
release_objs_callback(request);
134+
return rc;
135+
}
136+
137+
int ompi_coll_base_retain_op( ompi_request_t *req, ompi_op_t *op,
138+
ompi_datatype_t *type) {
139+
ompi_coll_base_nbc_request_t *request = (ompi_coll_base_nbc_request_t *)req;
140+
bool retain = false;
141+
if (!ompi_op_is_intrinsic(op)) {
142+
OBJ_RETAIN(op);
143+
request->data.op.op = op;
144+
retain = true;
145+
}
146+
if (!ompi_datatype_is_predefined(type)) {
147+
OBJ_RETAIN(type);
148+
request->data.op.datatype = type;
149+
retain = true;
150+
}
151+
if (OPAL_UNLIKELY(retain)) {
152+
if (req->req_persistent) {
153+
request->cb.req_free = req->req_free;
154+
req->req_free = free_objs_callback;
155+
} else {
156+
request->cb.req_complete_cb = req->req_complete_cb;
157+
request->req_complete_cb_data = req->req_complete_cb_data;
158+
req->req_complete_cb = complete_objs_callback;
159+
req->req_complete_cb_data = request;
160+
}
161+
}
162+
return OMPI_SUCCESS;
163+
}
164+
165+
/**
 * Keep user-defined send/receive datatypes alive for the lifetime of a
 * non-blocking (or persistent) collective request.
 *
 * Each datatype is retained when it is non-NULL and not predefined. When at
 * least one was retained, a hook is installed on the request so the
 * references are dropped later:
 *  - persistent request: the current req_free is saved and replaced by
 *    free_objs_callback;
 *  - otherwise: the current completion callback and its data are saved and
 *    replaced by complete_objs_callback.
 *
 * @param req    request; it is cast to ompi_coll_base_nbc_request_t
 * @param stype  send datatype, may be NULL
 * @param rtype  receive datatype, may be NULL
 * @return OMPI_SUCCESS (always)
 */
int ompi_coll_base_retain_datatypes( ompi_request_t *req, ompi_datatype_t *stype,
                                     ompi_datatype_t *rtype) {
    ompi_coll_base_nbc_request_t *nbc_req = (ompi_coll_base_nbc_request_t *)req;
    bool keep_stype, keep_rtype;

    keep_stype = (NULL != stype) && !ompi_datatype_is_predefined(stype);
    if (keep_stype) {
        OBJ_RETAIN(stype);
        nbc_req->data.types.stype = stype;
    }

    keep_rtype = (NULL != rtype) && !ompi_datatype_is_predefined(rtype);
    if (keep_rtype) {
        OBJ_RETAIN(rtype);
        nbc_req->data.types.rtype = rtype;
    }

    if (OPAL_UNLIKELY(keep_stype || keep_rtype)) {
        if (!req->req_persistent) {
            /* release on completion, after the original completion callback */
            nbc_req->cb.req_complete_cb = req->req_complete_cb;
            nbc_req->req_complete_cb_data = req->req_complete_cb_data;
            req->req_complete_cb = complete_objs_callback;
            req->req_complete_cb_data = nbc_req;
        } else {
            /* release when the persistent request is freed */
            nbc_req->cb.req_free = req->req_free;
            req->req_free = free_objs_callback;
        }
    }

    return OMPI_SUCCESS;
}
192+
193+
/**
 * Drop the references held on the datatype arrays attached to a
 * non-blocking collective request.
 *
 * The number of entries visited is the remote size of the request's
 * communicator for an inter-communicator, and its size otherwise. An entry
 * is released only when it is non-NULL and not predefined, which mirrors
 * the condition under which ompi_coll_base_retain_datatypes_w() retains it;
 * releasing a predefined datatype that was never retained would unbalance
 * its reference count.
 *
 * The array pointers stored in the request are cleared afterwards so that a
 * second invocation on the same request cannot release the entries twice.
 *
 * @param request  request whose attached datatype arrays must be released
 */
static void release_vecs_callback(ompi_coll_base_nbc_request_t *request) {
    ompi_communicator_t *comm = request->super.req_mpi_object.comm;
    int count = OMPI_COMM_IS_INTER(comm)?ompi_comm_remote_size(comm):ompi_comm_size(comm);
    ompi_datatype_t **stypes = request->data.vecs.stypes;
    ompi_datatype_t **rtypes = request->data.vecs.rtypes;

    for (int i=0; i<count; i++) {
        if (NULL != stypes && NULL != stypes[i] && !ompi_datatype_is_predefined(stypes[i])) {
            OMPI_DATATYPE_RELEASE(stypes[i]);
        }
        if (NULL != rtypes && NULL != rtypes[i] && !ompi_datatype_is_predefined(rtypes[i])) {
            OMPI_DATATYPE_RELEASE(rtypes[i]);
        }
    }

    /* the references are gone: forget the arrays */
    request->data.vecs.stypes = NULL;
    request->data.vecs.rtypes = NULL;
}
205+
206+
static int complete_vecs_callback(struct ompi_request_t *req) {
207+
ompi_coll_base_nbc_request_t *request = (ompi_coll_base_nbc_request_t *)req;
208+
int rc = OMPI_SUCCESS;
209+
assert (NULL != request);
210+
if (NULL != request->cb.req_complete_cb) {
211+
rc = request->cb.req_complete_cb(request->req_complete_cb_data);
212+
}
213+
release_vecs_callback(request);
214+
return rc;
215+
}
216+
217+
static int free_vecs_callback(struct ompi_request_t **rptr) {
218+
struct ompi_coll_base_nbc_request_t *request = *(ompi_coll_base_nbc_request_t **)rptr;
219+
int rc = OMPI_SUCCESS;
220+
if (NULL != request->cb.req_free) {
221+
rc = request->cb.req_free(rptr);
222+
}
223+
release_vecs_callback(request);
224+
return rc;
225+
}
226+
227+
/**
 * Keep user-defined datatypes from per-peer datatype arrays alive for the
 * lifetime of a non-blocking (or persistent) collective request.
 *
 * The number of entries examined in each array is the remote size of the
 * request's communicator for an inter-communicator, and its size otherwise.
 * An entry is retained when it is non-NULL and not predefined; a NULL array
 * is skipped entirely. When at least one entry was retained, both array
 * pointers (not copies of the arrays) are stored in the request and a hook
 * is installed so the references are dropped later:
 *  - persistent request: the current req_free is saved and replaced by
 *    free_vecs_callback;
 *  - otherwise: the current completion callback and its data are saved and
 *    replaced by complete_vecs_callback.
 *
 * @param req     request; it is cast to ompi_coll_base_nbc_request_t and its
 *                communicator must already be set
 * @param stypes  array of send datatypes, may be NULL
 * @param rtypes  array of receive datatypes, may be NULL
 * @return OMPI_SUCCESS (always)
 */
int ompi_coll_base_retain_datatypes_w( ompi_request_t *req,
                                       ompi_datatype_t *stypes[], ompi_datatype_t *rtypes[]) {
    ompi_coll_base_nbc_request_t *nbc_req = (ompi_coll_base_nbc_request_t *)req;
    ompi_communicator_t *comm = nbc_req->super.req_mpi_object.comm;
    int npeers = OMPI_COMM_IS_INTER(comm)?ompi_comm_remote_size(comm):ompi_comm_size(comm);
    bool any_retained = false;

    for (int peer = 0; peer < npeers; peer++) {
        if (NULL != stypes) {
            ompi_datatype_t *sdt = stypes[peer];
            if (NULL != sdt && !ompi_datatype_is_predefined(sdt)) {
                OBJ_RETAIN(sdt);
                any_retained = true;
            }
        }
        if (NULL != rtypes) {
            ompi_datatype_t *rdt = rtypes[peer];
            if (NULL != rdt && !ompi_datatype_is_predefined(rdt)) {
                OBJ_RETAIN(rdt);
                any_retained = true;
            }
        }
    }

    if (OPAL_UNLIKELY(any_retained)) {
        /* remember the caller's arrays so the entries can be released later */
        nbc_req->data.vecs.stypes = stypes;
        nbc_req->data.vecs.rtypes = rtypes;
        if (!req->req_persistent) {
            /* release on completion, after the original completion callback */
            nbc_req->cb.req_complete_cb = req->req_complete_cb;
            nbc_req->req_complete_cb_data = req->req_complete_cb_data;
            req->req_complete_cb = complete_vecs_callback;
            req->req_complete_cb_data = nbc_req;
        } else {
            /* release when the persistent request is freed */
            nbc_req->cb.req_free = req->req_free;
            req->req_free = free_vecs_callback;
        }
    }

    return OMPI_SUCCESS;
}
259+
260+
/**
 * Constructor of ompi_coll_base_nbc_request_t: start with no attached
 * objects and no saved callback.
 *
 * @param req  request being constructed
 */
static void nbc_req_cons(ompi_coll_base_nbc_request_t *req) {
    /* nothing retained yet */
    req->data.objs.objs[0] = NULL;
    req->data.objs.objs[1] = NULL;
    /* nothing saved yet (cb.req_complete_cb shares a union with cb.req_free) */
    req->req_complete_cb_data = NULL;
    req->cb.req_complete_cb = NULL;
}

/* class descriptor: derives from ompi_request_t, no destructor */
OBJ_CLASS_INSTANCE(ompi_coll_base_nbc_request_t, ompi_request_t, nbc_req_cons, NULL);

ompi/mca/coll/base/coll_base_util.h

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@
99
* University of Stuttgart. All rights reserved.
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
12-
* Copyright (c) 2014-2017 Research Organization for Information Science
13-
* and Technology (RIST). All rights reserved.
12+
* Copyright (c) 2014-2019 Research Organization for Information Science
13+
* and Technology (RIST). All rights reserved.
1414
* $COPYRIGHT$
1515
*
1616
* Additional copyrights may follow
@@ -27,10 +27,41 @@
2727
#include "ompi/mca/mca.h"
2828
#include "ompi/datatype/ompi_datatype.h"
2929
#include "ompi/request/request.h"
30+
#include "ompi/op/op.h"
3031
#include "ompi/mca/pml/pml.h"
3132

3233
BEGIN_C_DECLS
3334

35+
struct ompi_coll_base_nbc_request_t {
36+
ompi_request_t super;
37+
union {
38+
ompi_request_complete_fn_t req_complete_cb;
39+
ompi_request_free_fn_t req_free;
40+
} cb;
41+
void *req_complete_cb_data;
42+
union {
43+
struct {
44+
ompi_op_t *op;
45+
ompi_datatype_t *datatype;
46+
} op;
47+
struct {
48+
ompi_datatype_t *stype;
49+
ompi_datatype_t *rtype;
50+
} types;
51+
struct {
52+
opal_object_t *objs[2];
53+
} objs;
54+
struct {
55+
ompi_datatype_t **stypes;
56+
ompi_datatype_t **rtypes;
57+
} vecs;
58+
} data;
59+
};
60+
61+
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_coll_base_nbc_request_t);
62+
63+
typedef struct ompi_coll_base_nbc_request_t ompi_coll_base_nbc_request_t;
64+
3465
/**
3566
* A MPI_like function doing a send and a receive simultaneously.
3667
* If one of the communications results in a zero-byte message the
@@ -84,5 +115,17 @@ unsigned int ompi_mirror_perm(unsigned int x, int nbits);
84115
*/
85116
int ompi_rounddown(int num, int factor);
86117

118+
int ompi_coll_base_retain_op( ompi_request_t *request,
119+
ompi_op_t *op,
120+
ompi_datatype_t *type);
121+
122+
int ompi_coll_base_retain_datatypes( ompi_request_t *request,
123+
ompi_datatype_t *stype,
124+
ompi_datatype_t *rtype);
125+
126+
int ompi_coll_base_retain_datatypes_w( ompi_request_t *request,
127+
ompi_datatype_t *stypes[],
128+
ompi_datatype_t *rtypes[]);
129+
87130
END_C_DECLS
88131
#endif /* MCA_COLL_BASE_UTIL_EXPORT_H */

ompi/mca/coll/libnbc/coll_libnbc.h

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@
1313
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
1414
* Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights
1515
* reserved.
16-
* Copyright (c) 2014-2017 Research Organization for Information Science
17-
* and Technology (RIST). All rights reserved.
16+
* Copyright (c) 2014-2019 Research Organization for Information Science
17+
* and Technology (RIST). All rights reserved.
1818
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
1919
* Copyright (c) 2018 FUJITSU LIMITED. All rights reserved.
2020
* $COPYRIGHT$
@@ -28,7 +28,7 @@
2828
#define MCA_COLL_LIBNBC_EXPORT_H
2929

3030
#include "ompi/mca/coll/coll.h"
31-
#include "ompi/request/request.h"
31+
#include "ompi/mca/coll/base/coll_base_util.h"
3232
#include "opal/sys/atomic.h"
3333

3434
BEGIN_C_DECLS
@@ -121,7 +121,7 @@ typedef struct NBC_Schedule NBC_Schedule;
121121
OBJ_CLASS_DECLARATION(NBC_Schedule);
122122

123123
struct ompi_coll_libnbc_request_t {
124-
ompi_request_t super;
124+
ompi_coll_base_nbc_request_t super;
125125
MPI_Comm comm;
126126
long row_offset;
127127
bool nbc_complete; /* status in libnbc level */
@@ -145,13 +145,13 @@ typedef ompi_coll_libnbc_request_t NBC_Handle;
145145
opal_free_list_item_t *item; \
146146
item = opal_free_list_wait (&mca_coll_libnbc_component.requests); \
147147
req = (ompi_coll_libnbc_request_t*) item; \
148-
OMPI_REQUEST_INIT(&req->super, persistent); \
149-
req->super.req_mpi_object.comm = comm; \
148+
OMPI_REQUEST_INIT(&req->super.super, persistent); \
149+
req->super.super.req_mpi_object.comm = comm; \
150150
} while (0)
151151

152152
#define OMPI_COLL_LIBNBC_REQUEST_RETURN(req) \
153153
do { \
154-
OMPI_REQUEST_FINI(&(req)->super); \
154+
OMPI_REQUEST_FINI(&(req)->super.super); \
155155
opal_free_list_return (&mca_coll_libnbc_component.requests, \
156156
(opal_free_list_item_t*) (req)); \
157157
} while (0)

ompi/mca/coll/libnbc/coll_libnbc_component.c

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@
1313
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
1414
* Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights
1515
* reserved.
16-
* Copyright (c) 2016-2017 Research Organization for Information Science
17-
* and Technology (RIST). All rights reserved.
16+
* Copyright (c) 2016-2019 Research Organization for Information Science
17+
* and Technology (RIST). All rights reserved.
1818
* Copyright (c) 2016 IBM Corporation. All rights reserved.
1919
* Copyright (c) 2017 Ian Bradley Morgan and Anthony Skjellum. All
2020
* rights reserved.
@@ -448,21 +448,21 @@ ompi_coll_libnbc_progress(void)
448448
/* done, remove and complete */
449449
OPAL_THREAD_LOCK(&mca_coll_libnbc_component.lock);
450450
opal_list_remove_item(&mca_coll_libnbc_component.active_requests,
451-
&request->super.super.super);
451+
&request->super.super.super.super);
452452
OPAL_THREAD_UNLOCK(&mca_coll_libnbc_component.lock);
453453

454454
if( OMPI_SUCCESS == res || NBC_OK == res || NBC_SUCCESS == res ) {
455-
request->super.req_status.MPI_ERROR = OMPI_SUCCESS;
455+
request->super.super.req_status.MPI_ERROR = OMPI_SUCCESS;
456456
}
457457
else {
458-
request->super.req_status.MPI_ERROR = res;
458+
request->super.super.req_status.MPI_ERROR = res;
459459
}
460-
if(request->super.req_persistent) {
460+
if(request->super.super.req_persistent) {
461461
/* reset for the next communication */
462462
request->row_offset = 0;
463463
}
464-
if(!request->super.req_persistent || !REQUEST_COMPLETE(&request->super)) {
465-
ompi_request_complete(&request->super, true);
464+
if(!request->super.super.req_persistent || !REQUEST_COMPLETE(&request->super.super)) {
465+
ompi_request_complete(&request->super.super, true);
466466
}
467467
}
468468
OPAL_THREAD_LOCK(&mca_coll_libnbc_component.lock);
@@ -527,7 +527,7 @@ request_start(size_t count, ompi_request_t ** requests)
527527
NBC_DEBUG(5, "tmpbuf address=%p size=%u\n", handle->tmpbuf, sizeof(handle->tmpbuf));
528528
NBC_DEBUG(5, "--------------------------------\n");
529529

530-
handle->super.req_complete = REQUEST_PENDING;
530+
handle->super.super.req_complete = REQUEST_PENDING;
531531
handle->nbc_complete = false;
532532

533533
res = NBC_Start(handle);
@@ -557,7 +557,7 @@ request_free(struct ompi_request_t **ompi_req)
557557
ompi_coll_libnbc_request_t *request =
558558
(ompi_coll_libnbc_request_t*) *ompi_req;
559559

560-
if( !REQUEST_COMPLETE(&request->super) ) {
560+
if( !REQUEST_COMPLETE(&request->super.super) ) {
561561
return MPI_ERR_REQUEST;
562562
}
563563

@@ -571,11 +571,11 @@ request_free(struct ompi_request_t **ompi_req)
571571
static void
572572
request_construct(ompi_coll_libnbc_request_t *request)
573573
{
574-
request->super.req_type = OMPI_REQUEST_COLL;
575-
request->super.req_status._cancelled = 0;
576-
request->super.req_start = request_start;
577-
request->super.req_free = request_free;
578-
request->super.req_cancel = request_cancel;
574+
request->super.super.req_type = OMPI_REQUEST_COLL;
575+
request->super.super.req_status._cancelled = 0;
576+
request->super.super.req_start = request_start;
577+
request->super.super.req_free = request_free;
578+
request->super.super.req_cancel = request_cancel;
579579
}
580580

581581

0 commit comments

Comments
 (0)