Skip to content

Commit dbe69f9

Browse files
committed
Optimize the packing of contiguous datatypes with gaps.
Signed-off-by: George Bosilca <[email protected]>
1 parent e1b06bc commit dbe69f9

File tree

1 file changed

+94
-129
lines changed

1 file changed

+94
-129
lines changed

opal/datatype/opal_datatype_pack.c

Lines changed: 94 additions & 129 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2016 The University of Tennessee and The University
6+
* Copyright (c) 2004-2019 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
@@ -53,8 +53,6 @@
5353
#endif /* defined(CHECKSUM) */
5454

5555

56-
#define IOVEC_MEM_LIMIT 8192
57-
5856
/* The contig versions do not use the stack. They can easily retrieve
5957
* the status with just the information from pConvertor->bConverted.
6058
*/
@@ -68,9 +66,8 @@ opal_pack_homogeneous_contig_function( opal_convertor_t* pConv,
6866
unsigned char *source_base = NULL;
6967
uint32_t iov_count;
7068
size_t length = pConv->local_size - pConv->bConverted, initial_amount = pConv->bConverted;
71-
ptrdiff_t initial_displ = pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp;
7269

73-
source_base = (pConv->pBaseBuf + initial_displ + pStack[0].disp + pStack[1].disp);
70+
source_base = (pConv->pBaseBuf + pConv->pDesc->true_lb + pStack[0].disp + pStack[1].disp);
7471

7572
/* There are some optimizations that can be done if the upper level
7673
* does not provide a buffer.
@@ -111,155 +108,123 @@ opal_pack_homogeneous_contig_with_gaps_function( opal_convertor_t* pConv,
111108
uint32_t* out_size,
112109
size_t* max_data )
113110
{
111+
size_t remaining, length, initial_bytes_converted = pConv->bConverted;
114112
const opal_datatype_t* pData = pConv->pDesc;
115113
dt_stack_t* stack = pConv->pStack;
114+
ptrdiff_t extent = pData->ub - pData->lb;
116115
unsigned char *user_memory, *packed_buffer;
117-
uint32_t iov_count, index;
116+
uint32_t idx = 0;
118117
size_t i;
119-
size_t bConverted, remaining, length, initial_bytes_converted = pConv->bConverted;
120-
ptrdiff_t extent= pData->ub - pData->lb;
121-
ptrdiff_t initial_displ = pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp;
122118

119+
/* The memory layout is contiguous with gaps at the beginning and at the end. The datatype true_lb
120+
* is the initial displacement, the size is the length of the contiguous area, and the extent represents
121+
* how much we should jump between elements.
122+
*/
123123
assert( (pData->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) && ((ptrdiff_t)pData->size != extent) );
124124
DO_DEBUG( opal_output( 0, "pack_homogeneous_contig( pBaseBuf %p, iov_count %d )\n",
125125
(void*)pConv->pBaseBuf, *out_size ); );
126126
if( stack[1].type != opal_datatype_uint1.id ) {
127127
stack[1].count *= opal_datatype_basicDatatypes[stack[1].type]->size;
128128
stack[1].type = opal_datatype_uint1.id;
129129
}
130+
/* We can directly provide pointers into the user buffers (like the convertor_raw) */
131+
if( NULL == iov[0].iov_base ) {
132+
user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp + stack[1].disp;
133+
if( stack[1].count != pData->size ) {
134+
iov[idx].iov_base = user_memory;
135+
iov[idx].iov_len = stack[1].count;
136+
COMPUTE_CSUM( iov[idx].iov_base, iov[idx].iov_len, pConv );
137+
stack[0].count--; /* update the first stack position */
138+
stack[0].disp += extent;
139+
stack[1].count = pData->size; /* for safety */
140+
stack[1].disp = 0;
141+
idx++; /* update next iovec */
142+
user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp;
143+
pConv->bConverted += stack[1].count;
144+
}
145+
for( ; (idx < (*out_size)) && stack[0].count; idx++ ) {
146+
iov[idx].iov_base = user_memory;
147+
iov[idx].iov_len = pData->size;
148+
COMPUTE_CSUM( iov[idx].iov_base, iov[idx].iov_len, pConv );
149+
stack[0].count--;
150+
stack[0].disp += extent;
151+
user_memory += extent;
152+
pConv->bConverted += pData->size;
153+
}
154+
goto update_status_and_return;
155+
}
130156

131-
/* There are some optimizations that can be done if the upper level
132-
* does not provide a buffer.
133-
*/
134-
for( iov_count = 0; iov_count < (*out_size); iov_count++ ) {
157+
for( idx = 0; idx < (*out_size); idx++ ) {
135158
/* Limit the amount of packed data to the data left over on this convertor */
136159
remaining = pConv->local_size - pConv->bConverted;
137160
if( 0 == remaining ) break; /* we're done this time */
138-
if( remaining > iov[iov_count].iov_len )
139-
remaining = iov[iov_count].iov_len;
140-
packed_buffer = (unsigned char *)iov[iov_count].iov_base;
141-
bConverted = remaining; /* how much will get unpacked this time */
142-
user_memory = pConv->pBaseBuf + initial_displ + stack[0].disp + stack[1].disp;
143-
i = pConv->count - stack[0].count; /* how many we already packed */
144-
assert(i == (pConv->bConverted / pData->size));
145-
146-
if( packed_buffer == NULL ) {
147-
/* special case for small data. We avoid allocating memory if we
148-
* can fill the iovec directly with the address of the remaining
149-
* data.
150-
*/
151-
if( stack->count < (size_t)((*out_size) - iov_count) ) {
152-
stack[1].count = pData->size - (pConv->bConverted % pData->size);
153-
for( index = iov_count; i < pConv->count; i++, index++ ) {
154-
iov[index].iov_base = (IOVBASE_TYPE *) user_memory;
155-
iov[index].iov_len = stack[1].count;
156-
stack[0].disp += extent;
157-
pConv->bConverted += stack[1].count;
158-
stack[1].disp = 0; /* reset it for the next round */
159-
stack[1].count = pData->size;
160-
user_memory = pConv->pBaseBuf + initial_displ + stack[0].disp;
161-
COMPUTE_CSUM( iov[index].iov_base, iov[index].iov_len, pConv );
162-
}
163-
*out_size = iov_count + index;
164-
*max_data = (pConv->bConverted - initial_bytes_converted);
165-
pConv->flags |= CONVERTOR_COMPLETED;
166-
return 1; /* we're done */
167-
}
168-
/* now special case for big contiguous data with gaps around */
169-
if( pData->size >= IOVEC_MEM_LIMIT ) {
170-
/* as we dont have to copy any data, we can simply fill the iovecs
171-
* with data from the user data description.
172-
*/
173-
for( index = iov_count; (i < pConv->count) && (index < (*out_size));
174-
i++, index++ ) {
175-
if( remaining < pData->size ) {
176-
iov[index].iov_base = (IOVBASE_TYPE *) user_memory;
177-
iov[index].iov_len = remaining;
178-
remaining = 0;
179-
COMPUTE_CSUM( iov[index].iov_base, iov[index].iov_len, pConv );
180-
break;
181-
} else {
182-
iov[index].iov_base = (IOVBASE_TYPE *) user_memory;
183-
iov[index].iov_len = pData->size;
184-
user_memory += extent;
185-
COMPUTE_CSUM( iov[index].iov_base, (size_t)iov[index].iov_len, pConv );
186-
}
187-
remaining -= iov[index].iov_len;
188-
pConv->bConverted += iov[index].iov_len;
189-
}
190-
*out_size = index;
191-
*max_data = (pConv->bConverted - initial_bytes_converted);
192-
if( pConv->bConverted == pConv->local_size ) {
193-
pConv->flags |= CONVERTOR_COMPLETED;
194-
return 1;
195-
}
196-
return 0;
161+
if( remaining > iov[idx].iov_len )
162+
remaining = iov[idx].iov_len;
163+
packed_buffer = (unsigned char *)iov[idx].iov_base;
164+
pConv->bConverted += remaining;
165+
user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp + stack[1].disp;
166+
167+
DO_DEBUG( opal_output( 0, "pack_homogeneous_contig( user_memory %p, packed_buffer %p length %" PRIsize_t "\n",
168+
(void*)user_memory, (void*)packed_buffer, remaining ); );
169+
170+
length = (0 == pConv->stack_pos ? 0 : stack[1].count); /* left over from the last pack */
171+
/* data left from last round and enough space in the buffer */
172+
if( (pData->size != length) && (length <= remaining)) {
173+
/* copy the partial left-over from the previous round */
174+
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, length, pConv->pBaseBuf,
175+
pData, pConv->count );
176+
DO_DEBUG( opal_output( 0, "pack dest %p src %p length %" PRIsize_t " [prologue]\n",
177+
(void*)user_memory, (void*)packed_buffer, length ); );
178+
MEMCPY_CSUM( packed_buffer, user_memory, length, pConv );
179+
packed_buffer += length;
180+
remaining -= length;
181+
stack[1].count -= length;
182+
stack[1].disp += length; /* just in case, we overwrite this below */
183+
if( 0 == stack[1].count) { /* one completed element */
184+
stack[0].count--;
185+
stack[0].disp += extent;
186+
if( 0 == stack[0].count ) /* no elements left: we are done */
187+
break;
188+
stack[1].count = pData->size;
189+
stack[1].disp = 0;
197190
}
191+
user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp + stack[1].disp;
198192
}
199193

200-
{
201-
DO_DEBUG( opal_output( 0, "pack_homogeneous_contig( user_memory %p, packed_buffer %p length %lu\n",
202-
(void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); );
203-
204-
length = (0 == pConv->stack_pos ? 0 : stack[1].count); /* left over from the last pack */
205-
/* data left from last round and enough space in the buffer */
206-
if( (0 != length) && (length <= remaining)) {
207-
/* copy the partial left-over from the previous round */
208-
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, length, pConv->pBaseBuf,
209-
pData, pConv->count );
210-
DO_DEBUG( opal_output( 0, "2. pack dest %p src %p length %lu\n",
211-
(void*)user_memory, (void*)packed_buffer, (unsigned long)length ); );
212-
MEMCPY_CSUM( packed_buffer, user_memory, length, pConv );
213-
packed_buffer += length;
214-
user_memory += (extent - pData->size + length);
215-
remaining -= length;
216-
stack[1].count -= length;
217-
if( 0 == stack[1].count) { /* one completed element */
218-
stack[0].count--;
219-
stack[0].disp += extent;
220-
if( 0 != stack[0].count ) { /* not yet done */
221-
stack[1].count = pData->size;
222-
stack[1].disp = 0;
223-
}
224-
}
225-
}
226-
for( i = 0; pData->size <= remaining; i++ ) {
227-
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, pData->size, pConv->pBaseBuf,
228-
pData, pConv->count );
229-
DO_DEBUG( opal_output( 0, "3. pack dest %p src %p length %lu\n",
230-
(void*)user_memory, (void*)packed_buffer, (unsigned long)pData->size ); );
231-
MEMCPY_CSUM( packed_buffer, user_memory, pData->size, pConv );
232-
packed_buffer += pData->size;
233-
user_memory += extent;
234-
remaining -= pData->size;
235-
}
236-
stack[0].count -= i; /* the filled up and the entire types */
237-
stack[0].disp += (i * extent);
238-
stack[1].disp += remaining;
239-
/* Copy the last bits */
240-
if( 0 != remaining ) {
241-
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, pConv->pBaseBuf,
242-
pData, pConv->count );
243-
DO_DEBUG( opal_output( 0, "4. pack dest %p src %p length %lu\n",
244-
(void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); );
245-
MEMCPY_CSUM( packed_buffer, user_memory, remaining, pConv );
246-
user_memory += remaining;
247-
stack[1].count -= remaining;
248-
}
194+
for( i = 0; pData->size <= remaining; i++ ) {
195+
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, pData->size, pConv->pBaseBuf,
196+
pData, pConv->count );
197+
DO_DEBUG( opal_output( 0, "pack dest %p src %p length %" PRIsize_t " [%" PRIsize_t "/%" PRIsize_t "\n",
198+
(void*)user_memory, (void*)packed_buffer, pData->size, remaining, iov[idx].iov_len ); );
199+
MEMCPY_CSUM( packed_buffer, user_memory, pData->size, pConv );
200+
packed_buffer += pData->size;
201+
user_memory += extent;
202+
remaining -= pData->size;
203+
}
204+
stack[0].count -= i; /* the entire datatype copied above */
205+
stack[0].disp += (i * extent);
206+
207+
/* Copy the last bits */
208+
if( 0 != remaining ) {
209+
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, pConv->pBaseBuf,
210+
pData, pConv->count );
211+
DO_DEBUG( opal_output( 0, "4. pack dest %p src %p length %" PRIsize_t "\n",
212+
(void*)user_memory, (void*)packed_buffer, remaining ); );
213+
MEMCPY_CSUM( packed_buffer, user_memory, remaining, pConv );
214+
stack[1].count -= remaining;
215+
stack[1].disp += remaining; /* keep the += in case we are copying less than the datatype size */
249216
if( 0 == stack[1].count ) { /* prepare for the next element */
250217
stack[1].count = pData->size;
251218
stack[1].disp = 0;
252219
}
253220
}
254-
pConv->bConverted += bConverted;
255221
}
256-
*out_size = iov_count;
257-
*max_data = (pConv->bConverted - initial_bytes_converted);
258-
if( pConv->bConverted == pConv->local_size ) {
259-
pConv->flags |= CONVERTOR_COMPLETED;
260-
return 1;
261-
}
262-
return 0;
222+
223+
update_status_and_return:
224+
*out_size = idx;
225+
*max_data = pConv->bConverted - initial_bytes_converted;
226+
if( pConv->bConverted == pConv->local_size ) pConv->flags |= CONVERTOR_COMPLETED;
227+
return !!(pConv->flags & CONVERTOR_COMPLETED); /* done or not */
263228
}
264229

265230
/* The pack/unpack functions need a cleanup. I have to create a proper interface to access

0 commit comments

Comments
 (0)