Skip to content

Commit aa17392

Browse files
committed
Optimize the pack/unpack.
Start optimizing the code. This commit divides the operations into two parts: the first, outside the critical path, deals with partial blocks of predefined elements; the second, inside the critical path, deals only with full blocks of elements. This reduces the number of expensive operations in the critical path and results in a decent performance increase. Signed-off-by: George Bosilca <[email protected]>
1 parent 3562d70 commit aa17392

File tree

4 files changed

+238
-149
lines changed

4 files changed

+238
-149
lines changed

opal/datatype/opal_datatype_pack.c

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -272,18 +272,32 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor,
272272
for( iov_count = 0; iov_count < (*out_size); iov_count++ ) {
273273
iov_ptr = (unsigned char *) iov[iov_count].iov_base;
274274
iov_len_local = iov[iov_count].iov_len;
275-
while( 1 ) {
276-
while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) {
277-
/* now here we have a basic datatype */
278-
PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc,
279-
conv_ptr, iov_ptr, iov_len_local );
280-
if( 0 == count_desc ) { /* completed */
275+
276+
if( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) {
277+
if( (pElem->elem.count * pElem->elem.blocklen) != count_desc ) {
278+
/* we have a partial (less than blocklen) basic datatype */
279+
int rc = PACK_PARTIAL_BLOCKLEN( pConvertor, pElem, count_desc,
280+
conv_ptr, iov_ptr, iov_len_local );
281+
if( 0 == rc ) /* not done */
282+
goto complete_loop;
283+
if( 0 == count_desc ) {
281284
conv_ptr = pConvertor->pBaseBuf + pStack->disp;
282285
pos_desc++; /* advance to the next data */
283286
UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
284-
continue;
285287
}
286-
goto complete_loop;
288+
}
289+
}
290+
291+
while( 1 ) {
292+
while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) {
293+
/* we have a basic datatype (working on full blocks) */
294+
PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc,
295+
conv_ptr, iov_ptr, iov_len_local );
296+
if( 0 != count_desc ) /* completed? */
297+
goto complete_loop;
298+
conv_ptr = pConvertor->pBaseBuf + pStack->disp;
299+
pos_desc++; /* advance to the next data */
300+
UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
287301
}
288302
if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */
289303
DO_DEBUG( opal_output( 0, "pack end_loop count %" PRIsize_t " stack_pos %d"

opal/datatype/opal_datatype_pack.h

Lines changed: 84 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,63 @@
2626
CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
2727
#endif
2828

29+
/**
30+
* Pack the tail of a partially processed block so that the caller can continue on a block boundary.
31+
* COUNT is the total number of predefined elements still to be packed for ELEM, not the amount to
* handle in this call: only the (COUNT % blocklen) elements that finish the current block are
* considered here, and that amount is further capped to the whole elements that fit in SPACE.
32+
* This lets the rest of the pack process handle only entire blocklen blocks (plus the left over).
*
* Data is read from (*memory + ELEM's disp) and written to *packed. On return COUNT and SPACE are
* decreased by what was copied and *packed is advanced by the same number of bytes. *memory is
* advanced by the bytes copied and, when the block was completed, additionally by
* (extent - blocklen * element size).
33+
*
34+
* Return 1 if COUNT is now aligned on a block (either nothing was left to do or the current block
* was completed by this call), 0 otherwise.
35+
*/
36+
static inline int
37+
pack_partial_blocklen( opal_convertor_t* CONVERTOR,
38+
const dt_elem_desc_t* ELEM,
39+
size_t* COUNT,
40+
unsigned char** memory,
41+
unsigned char** packed,
42+
size_t* SPACE )
43+
{
44+
const ddt_elem_desc_t* _elem = &((ELEM)->elem);
45+
size_t do_now_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size; /* size of one predefined element for now */
46+
size_t do_now = *(COUNT); /* re-assigned by the modulo below before being used */
47+
unsigned char* _memory = (*memory) + _elem->disp;
48+
unsigned char* _packed = *packed;
49+
50+
assert( *(COUNT) <= _elem->count * _elem->blocklen);
51+
52+
/**
53+
* First check whether we already did something on this element. The COUNT is the number
54+
* of remaining predefined types in the current elem, not how many predefined types
55+
* should be manipulated in the current call (this number is instead reflected on the
56+
* SPACE). A remainder of zero means we are already on a block boundary: nothing to do.
57+
*/
58+
if( 0 == (do_now = (*COUNT) % _elem->blocklen) )
59+
return 1;
60+
61+
size_t left_in_block = do_now; /* left in the current blocklen */
62+
63+
/* not enough room for the rest of the block: copy only the whole elements that fit in SPACE */
if( (do_now_bytes * do_now) > *(SPACE) )
64+
do_now = (*SPACE) / do_now_bytes;
65+
66+
do_now_bytes *= do_now; /* from here on: total number of bytes to copy */
67+
68+
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf,
69+
(CONVERTOR)->pDesc, (CONVERTOR)->count );
70+
DO_DEBUG( opal_output( 0, "pack memcpy( %p, %p, %lu ) => space %lu [partial]\n",
71+
_packed, (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); );
72+
MEMCPY_CSUM( _packed, _memory, do_now_bytes, (CONVERTOR) );
73+
*(memory) += (ptrdiff_t)do_now_bytes;
74+
if( do_now == left_in_block ) /* compensate if completed a blocklen: skip the gap between blocklen * size and extent */
75+
*(memory) += _elem->extent - (_elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size);
76+
77+
/* report the progress back to the caller through the in/out arguments */
*(COUNT) -= do_now;
78+
*(SPACE) -= do_now_bytes;
79+
*(packed) += do_now_bytes;
80+
return (do_now == left_in_block); /* 1 only if the whole remainder of the block was packed */
81+
}
82+
83+
/**
84+
* Pack entire blocks, plus a possible remainder if SPACE is constrained to less than COUNT elements.
85+
*/
2986
static inline void
3087
pack_predefined_data( opal_convertor_t* CONVERTOR,
3188
const dt_elem_desc_t* ELEM,
@@ -36,27 +93,24 @@ pack_predefined_data( opal_convertor_t* CONVERTOR,
3693
{
3794
const ddt_elem_desc_t* _elem = &((ELEM)->elem);
3895
size_t blocklen_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size;
39-
size_t cando_count = *(COUNT), do_now, do_now_bytes;
96+
size_t cando_count = *(COUNT), do_now_bytes;
4097
unsigned char* _memory = (*memory) + _elem->disp;
4198
unsigned char* _packed = *packed;
4299

100+
assert( 0 == (cando_count % _elem->blocklen) ); /* no partials here */
43101
assert( *(COUNT) <= _elem->count * _elem->blocklen);
44102

45103
if( (blocklen_bytes * cando_count) > *(SPACE) )
46104
cando_count = (*SPACE) / blocklen_bytes;
47105

48-
do_now = *(COUNT); /* save the COUNT for later */
49106
/* premptively update the number of COUNT we will return. */
50107
*(COUNT) -= cando_count;
51108

52-
if( 1 == _elem->count ) { /* Everything is contiguous, handle it as a prologue */
53-
goto do_epilog;
54-
}
55109
if( 1 == _elem->blocklen ) { /* Do as many full blocklen as possible */
56110
for(; cando_count > 0; cando_count--) {
57111
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf,
58112
(CONVERTOR)->pDesc, (CONVERTOR)->count );
59-
DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n",
113+
DO_DEBUG( opal_output( 0, "pack memcpy( %p, %p, %lu ) => space %lu [blen = 1]\n",
60114
(void*)_packed, (void*)_memory, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); );
61115
MEMCPY_CSUM( _packed, _memory, blocklen_bytes, (CONVERTOR) );
62116
_packed += blocklen_bytes;
@@ -65,61 +119,32 @@ pack_predefined_data( opal_convertor_t* CONVERTOR,
65119
goto update_and_return;
66120
}
67121

68-
blocklen_bytes *= _elem->blocklen;
69-
if( (_elem->count * _elem->blocklen) == cando_count ) {
70-
goto skip_prolog;
71-
}
72-
/**
73-
* First check if we already did something on this element ? The COUNT is the number
74-
* of remaining predefined types in the current elem, not how many predefined types
75-
* should be manipulated in the current call (this number is instead reflected on the
76-
* SPACE).
77-
*/
78-
do_now = do_now % _elem->blocklen; /* any partial elements ? */
122+
if( (1 < _elem->count) && (_elem->blocklen <= cando_count) ) {
123+
blocklen_bytes *= _elem->blocklen;
79124

80-
if( 0 != do_now ) {
81-
size_t left_in_block = do_now; /* left in the current blocklen */
82-
do_now = (do_now > cando_count ) ? cando_count : do_now;
83-
do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size;
84-
85-
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf,
86-
(CONVERTOR)->pDesc, (CONVERTOR)->count );
87-
DO_DEBUG( opal_output( 0, "pack 1. memcpy( %p, %p, %lu ) => space %lu [prolog]\n",
88-
_packed, (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); );
89-
MEMCPY_CSUM( _packed, _memory, do_now_bytes, (CONVERTOR) );
90-
_memory += (ptrdiff_t)do_now_bytes;
91-
/* compensate if we just completed a blocklen */
92-
if( do_now == left_in_block )
93-
_memory += _elem->extent - blocklen_bytes;
94-
_packed += do_now_bytes;
95-
cando_count -= do_now;
96-
}
97-
98-
skip_prolog:
99-
/* Do as many full blocklen as possible */
100-
for(size_t _i = 0; _elem->blocklen <= cando_count; _i++ ) {
101-
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf,
102-
(CONVERTOR)->pDesc, (CONVERTOR)->count );
103-
DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n",
104-
(void*)_packed, (void*)_memory, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); );
105-
MEMCPY_CSUM( _packed, _memory, blocklen_bytes, (CONVERTOR) );
106-
_packed += blocklen_bytes;
107-
_memory += _elem->extent;
108-
cando_count -= _elem->blocklen;
125+
do { /* Do as many full blocklen as possible */
126+
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf,
127+
(CONVERTOR)->pDesc, (CONVERTOR)->count );
128+
DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n",
129+
(void*)_packed, (void*)_memory, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); );
130+
MEMCPY_CSUM( _packed, _memory, blocklen_bytes, (CONVERTOR) );
131+
_packed += blocklen_bytes;
132+
_memory += _elem->extent;
133+
cando_count -= _elem->blocklen;
134+
} while (_elem->blocklen <= cando_count);
109135
}
110136

111137
/**
112138
* As an epilog do anything left from the last blocklen.
113139
*/
114140
if( 0 != cando_count ) {
115-
116-
do_epilog:
117-
assert( cando_count < _elem->blocklen );
141+
assert( (cando_count < _elem->blocklen) ||
142+
((1 == _elem->count) && (cando_count <= _elem->blocklen)) );
118143
do_now_bytes = cando_count * opal_datatype_basicDatatypes[_elem->common.type]->size;
119144
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf,
120145
(CONVERTOR)->pDesc, (CONVERTOR)->count );
121146
DO_DEBUG( opal_output( 0, "pack 3. memcpy( %p, %p, %lu ) => space %lu [epilog]\n",
122-
(void*)_packed, (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); );
147+
(void*)_packed, (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); );
123148
MEMCPY_CSUM( _packed, _memory, do_now_bytes, (CONVERTOR) );
124149
_memory += do_now_bytes;
125150
_packed += do_now_bytes;
@@ -159,7 +184,15 @@ static inline void pack_contiguous_loop( opal_convertor_t* CONVERTOR,
159184
*(COUNT) -= _copy_loops;
160185
}
161186

162-
#define PACK_PREDEFINED_DATATYPE( CONVERTOR, /* the convertor */ \
187+
/* Wrapper around pack_partial_blocklen(): COUNT, MEMORY, PACKED and SPACE are passed by
 * address, so they must be lvalues; the expression evaluates to the function's int result. */
#define PACK_PARTIAL_BLOCKLEN( CONVERTOR, /* the convertor */ \
188+
ELEM, /* the basic element to be packed */ \
189+
COUNT, /* the number of elements still to pack (passed by address) */ \
190+
MEMORY, /* the source pointer (char*), passed by address */ \
191+
PACKED, /* the destination pointer (char*), passed by address */ \
192+
SPACE ) /* the space in the destination buffer (passed by address) */ \
193+
pack_partial_blocklen( (CONVERTOR), (ELEM), &(COUNT), &(MEMORY), &(PACKED), &(SPACE) )
194+
195+
#define PACK_PREDEFINED_DATATYPE( CONVERTOR, /* the convertor */ \
163196
ELEM, /* the basic element to be packed */ \
164197
COUNT, /* the number of elements */ \
165198
MEMORY, /* the source pointer (char*) */ \

opal/datatype/opal_datatype_unpack.c

Lines changed: 45 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,7 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor,
282282
for( iov_count = 0; iov_count < (*out_size); iov_count++ ) {
283283
iov_ptr = (unsigned char *) iov[iov_count].iov_base;
284284
iov_len_local = iov[iov_count].iov_len;
285+
285286
if( 0 != pConvertor->partial_length ) {
286287
size_t element_length = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;
287288
size_t missing_length = element_length - pConvertor->partial_length;
@@ -302,46 +303,41 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor,
302303
iov_len_local -= missing_length;
303304
pConvertor->partial_length = 0; /* nothing more inside */
304305
}
305-
while( 1 ) {
306-
while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) {
307-
/* now here we have a basic datatype */
308-
UNPACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc,
309-
iov_ptr, conv_ptr, iov_len_local );
310-
if( 0 == count_desc ) { /* completed */
306+
if( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) {
307+
if( (pElem->elem.count * pElem->elem.blocklen) != count_desc ) {
308+
/* we have a partial (less than blocklen) basic datatype */
309+
int rc = UNPACK_PARTIAL_BLOCKLEN( pConvertor, pElem, count_desc,
310+
iov_ptr, conv_ptr, iov_len_local );
311+
if( 0 == rc ) /* not done */
312+
goto complete_loop;
313+
if( 0 == count_desc ) {
311314
conv_ptr = pConvertor->pBaseBuf + pStack->disp;
312315
pos_desc++; /* advance to the next data */
313316
UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
314-
continue;
315-
}
316-
assert( pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED );
317-
if( 0 != iov_len_local ) {
318-
unsigned char* temp = conv_ptr;
319-
/* We have some partial data here. Let's copy it into the convertor
320-
* and keep it hot until the next round.
321-
*/
322-
assert( iov_len_local < opal_datatype_basicDatatypes[pElem->elem.common.type]->size );
323-
COMPUTE_CSUM( iov_ptr, iov_len_local, pConvertor );
324-
325-
opal_unpack_partial_datatype( pConvertor, pElem,
326-
iov_ptr, 0, iov_len_local,
327-
&temp );
328-
329-
pConvertor->partial_length = iov_len_local;
330-
iov_len_local = 0;
331317
}
332-
goto complete_loop;
318+
}
319+
}
320+
321+
while( 1 ) {
322+
while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) {
323+
/* we have a basic datatype (working on full blocks) */
324+
UNPACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc,
325+
iov_ptr, conv_ptr, iov_len_local );
326+
if( 0 != count_desc ) /* completed? */
327+
goto complete_loop;
328+
conv_ptr = pConvertor->pBaseBuf + pStack->disp;
329+
pos_desc++; /* advance to the next data */
330+
UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
333331
}
334332
if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */
335333
DO_DEBUG( opal_output( 0, "unpack end_loop count %" PRIsize_t " stack_pos %d pos_desc %d disp %ld space %lu\n",
336334
pStack->count, pConvertor->stack_pos, pos_desc,
337335
pStack->disp, (unsigned long)iov_len_local ); );
338336
if( --(pStack->count) == 0 ) { /* end of loop */
339337
if( 0 == pConvertor->stack_pos ) {
340-
/* Do the same thing as when the loop is completed */
341-
iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */
342-
total_unpacked += iov[iov_count].iov_len;
343-
iov_count++; /* go to the next */
344-
goto complete_conversion;
338+
/* we're done. Force the exit of the main for loop (around iovec) */
339+
*out_size = iov_count;
340+
goto complete_loop;
345341
}
346342
pConvertor->stack_pos--;
347343
pStack--;
@@ -380,14 +376,29 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor,
380376
conv_ptr = pConvertor->pBaseBuf + pStack->disp;
381377
UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
382378
DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" );
383-
continue;
384379
}
385380
}
386381
complete_loop:
382+
assert( pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED );
383+
if( 0 != iov_len_local ) {
384+
unsigned char* temp = conv_ptr;
385+
/* We have some partial data here. Let's copy it into the convertor
386+
* and keep it hot until the next round.
387+
*/
388+
assert( iov_len_local < opal_datatype_basicDatatypes[pElem->elem.common.type]->size );
389+
COMPUTE_CSUM( iov_ptr, iov_len_local, pConvertor );
390+
391+
opal_unpack_partial_datatype( pConvertor, pElem,
392+
iov_ptr, 0, iov_len_local,
393+
&temp );
394+
395+
pConvertor->partial_length = iov_len_local;
396+
iov_len_local = 0;
397+
}
398+
387399
iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */
388400
total_unpacked += iov[iov_count].iov_len;
389401
}
390-
complete_conversion:
391402
*max_data = total_unpacked;
392403
pConvertor->bConverted += total_unpacked; /* update the already converted bytes */
393404
*out_size = iov_count;
@@ -514,11 +525,9 @@ opal_unpack_general_function( opal_convertor_t* pConvertor,
514525
pStack->disp, (unsigned long)iov_len_local ); );
515526
if( --(pStack->count) == 0 ) { /* end of loop */
516527
if( 0 == pConvertor->stack_pos ) {
517-
/* Do the same thing as when the loop is completed */
518-
iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */
519-
total_unpacked += iov[iov_count].iov_len;
520-
iov_count++; /* go to the next */
521-
goto complete_conversion;
528+
/* we're done. Force the exit of the main for loop (around iovec) */
529+
*out_size = iov_count;
530+
goto complete_loop;
522531
}
523532
pConvertor->stack_pos--;
524533
pStack--;
@@ -552,7 +561,6 @@ opal_unpack_general_function( opal_convertor_t* pConvertor,
552561
iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */
553562
total_unpacked += iov[iov_count].iov_len;
554563
}
555-
complete_conversion:
556564
*max_data = total_unpacked;
557565
pConvertor->bConverted += total_unpacked; /* update the already converted bytes */
558566
*out_size = iov_count;

0 commit comments

Comments
 (0)