Skip to content

Commit 447b289

Browse files
committed
Reduce the amount of temporary memory needed for MPI_Alltoallw.
Don't copy the datatype into a buffer with the same extent; instead, pack it and send it to the peer as packed data. Signed-off-by: George Bosilca <[email protected]>
1 parent 74049fc commit 447b289

File tree

1 file changed

+28
-25
lines changed

1 file changed

+28
-25
lines changed

ompi/mca/coll/basic/coll_basic_alltoallw.c

Lines changed: 28 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#include "mpi.h"
3232
#include "ompi/constants.h"
3333
#include "ompi/datatype/ompi_datatype.h"
34+
#include "opal/datatype/opal_convertor_internal.h"
3435
#include "ompi/mca/coll/coll.h"
3536
#include "ompi/mca/coll/base/coll_tags.h"
3637
#include "ompi/mca/pml/pml.h"
@@ -42,12 +43,11 @@ mca_coll_basic_alltoallw_intra_inplace(const void *rbuf, const int *rcounts, con
4243
struct ompi_communicator_t *comm,
4344
mca_coll_base_module_t *module)
4445
{
45-
int i, j, size, rank, err = MPI_SUCCESS, max_size;
46+
int i, j, size, rank, err = MPI_SUCCESS;
4647
ompi_request_t *req;
4748
char *save_buffer = NULL;
48-
ptrdiff_t ext, gap = 0;
49-
50-
/* Initialize. */
49+
size_t max_size = 0, packed_size;
50+
opal_convertor_t convertor;
5151

5252
size = ompi_comm_size(comm);
5353
rank = ompi_comm_rank(comm);
@@ -57,11 +57,14 @@ mca_coll_basic_alltoallw_intra_inplace(const void *rbuf, const int *rcounts, con
5757
return MPI_SUCCESS;
5858
}
5959

60-
/* Find the largest receive amount */
60+
/* Find the largest amount of packed send/recv data */
6161
for (i = 0, max_size = 0 ; i < size ; ++i) {
62-
ext = opal_datatype_span(&rdtypes[i]->super, rcounts[i], &gap);
62+
ompi_proc_t *ompi_proc = ompi_comm_peer_lookup(comm, i);
6363

64-
max_size = ext > max_size ? ext : max_size;
64+
packed_size = opal_datatype_compute_remote_size(&rdtypes[i]->super,
65+
ompi_proc->super.proc_convertor->master->remote_sizes);
66+
packed_size *= rcounts[i];
67+
max_size = packed_size > max_size ? packed_size : max_size;
6568
}
6669

6770
/* Allocate a temporary buffer */
@@ -77,45 +80,45 @@ mca_coll_basic_alltoallw_intra_inplace(const void *rbuf, const int *rcounts, con
7780
msg_size_i *= rcounts[i];
7881
for (j = i+1 ; j < size ; ++j) {
7982
size_t msg_size_j;
83+
struct iovec iov = {.iov_base = save_buffer, .iov_len = max_size};
84+
uint32_t iov_count = 1;
8085
ompi_datatype_type_size(rdtypes[j], &msg_size_j);
8186
msg_size_j *= rcounts[j];
8287

8388
/* Initiate all send/recv to/from others. */
8489
if (i == rank && msg_size_j != 0) {
85-
char * tmp_buffer;
86-
/* Shift the temporary buffer according to the current datatype */
87-
(void)opal_datatype_span(&rdtypes[j]->super, rcounts[j], &gap);
88-
tmp_buffer = save_buffer - gap;
89-
/* Copy the data into the temporary buffer */
90-
err = ompi_datatype_copy_content_same_ddt (rdtypes[j], rcounts[j],
91-
tmp_buffer, (char *) rbuf + rdisps[j]);
92-
if (MPI_SUCCESS != err) { goto error_hndl; }
90+
ompi_proc_t *ompi_proc = ompi_comm_peer_lookup(comm, j);
91+
opal_convertor_clone(&convertor, ompi_proc->super.proc_convertor, 0);
92+
opal_convertor_prepare_for_send(&convertor, &rdtypes[j]->super, rcounts[j],
93+
(char *) rbuf + rdisps[j]);
94+
packed_size = max_size;
95+
err = opal_convertor_pack(&convertor, &iov, &iov_count, &packed_size);
96+
if (1 != err) { goto error_hndl; }
9397

9498
/* Exchange data with the peer */
9599
err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[j], rcounts[j], rdtypes[j],
96100
j, MCA_COLL_BASE_TAG_ALLTOALLW, comm, &req));
97101
if (MPI_SUCCESS != err) { goto error_hndl; }
98102

99-
err = MCA_PML_CALL(send ((void *) tmp_buffer, rcounts[j], rdtypes[j],
103+
err = MCA_PML_CALL(send ((void *) save_buffer, packed_size, MPI_PACKED,
100104
j, MCA_COLL_BASE_TAG_ALLTOALLW, MCA_PML_BASE_SEND_STANDARD,
101105
comm));
102106
if (MPI_SUCCESS != err) { goto error_hndl; }
103107
} else if (j == rank && msg_size_i != 0) {
104-
char * tmp_buffer;
105-
/* Shift the temporary buffer according to the current datatype */
106-
(void)opal_datatype_span(&rdtypes[i]->super, rcounts[i], &gap);
107-
tmp_buffer = save_buffer - gap;
108-
/* Copy the data into the temporary buffer */
109-
err = ompi_datatype_copy_content_same_ddt (rdtypes[i], rcounts[i],
110-
tmp_buffer, (char *) rbuf + rdisps[i]);
111-
if (MPI_SUCCESS != err) { goto error_hndl; }
108+
ompi_proc_t *ompi_proc = ompi_comm_peer_lookup(comm, i);
109+
opal_convertor_clone(&convertor, ompi_proc->super.proc_convertor, 0);
110+
opal_convertor_prepare_for_send(&convertor, &rdtypes[i]->super, rcounts[i],
111+
(char *) rbuf + rdisps[i]);
112+
packed_size = max_size;
113+
err = opal_convertor_pack(&convertor, &iov, &iov_count, &packed_size);
114+
if (1 != err) { goto error_hndl; }
112115

113116
/* Exchange data with the peer */
114117
err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[i], rcounts[i], rdtypes[i],
115118
i, MCA_COLL_BASE_TAG_ALLTOALLW, comm, &req));
116119
if (MPI_SUCCESS != err) { goto error_hndl; }
117120

118-
err = MCA_PML_CALL(send ((void *) tmp_buffer, rcounts[i], rdtypes[i],
121+
err = MCA_PML_CALL(send ((void *) save_buffer, packed_size, MPI_PACKED,
119122
i, MCA_COLL_BASE_TAG_ALLTOALLW, MCA_PML_BASE_SEND_STANDARD,
120123
comm));
121124
if (MPI_SUCCESS != err) { goto error_hndl; }

0 commit comments

Comments
 (0)