Skip to content

Commit 2324c63

Browse files
committed
Clean the pack and unpack functions.
This fixes a performance issue for all datatypes with a count of 1. Signed-off-by: George Bosilca <[email protected]>
1 parent 86eda65 commit 2324c63

6 files changed

+134
-159
lines changed

opal/datatype/opal_datatype_copy.h

Lines changed: 14 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,9 @@ static inline void _predefined_data( const dt_elem_desc_t* ELEM,
5151
const ddt_elem_desc_t* _elem = &((ELEM)->elem);
5252
unsigned char* _source = (SOURCE) + _elem->disp;
5353
unsigned char* _destination = (DESTINATION) + _elem->disp;
54-
size_t total_count = _elem->count * _elem->blocklen;
55-
size_t do_now, do_now_bytes;
54+
size_t do_now = _elem->count, do_now_bytes;
5655

57-
assert( (COUNT) == total_count);
58-
assert( total_count <= ((*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size) );
56+
assert( (COUNT) == (do_now * _elem->blocklen));
5957

6058
/* We don't need a prologue and epilogue here as we are __always__ working
6159
* with full copies of the data description.
@@ -64,21 +62,19 @@ static inline void _predefined_data( const dt_elem_desc_t* ELEM,
6462
/**
6563
* Compute how many full blocklen we need to do and do them.
6664
*/
67-
do_now = _elem->count;
68-
if( 0 != do_now ) {
69-
do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size;
70-
for(size_t _i = 0; _i < do_now; _i++ ) {
71-
OPAL_DATATYPE_SAFEGUARD_POINTER( _source, do_now_bytes, (SOURCE_BASE),
72-
(DATATYPE), (TOTAL_COUNT) );
73-
DO_DEBUG( opal_output( 0, "copy %s( %p, %p, %" PRIsize_t " ) => space %" PRIsize_t "\n",
74-
STRINGIFY(MEM_OP_NAME), (void*)_destination, (void*)_source, do_now_bytes, *(SPACE) ); );
75-
MEM_OP( _destination, _source, do_now_bytes );
76-
_destination += _elem->extent;
77-
_source += _elem->extent;
78-
*(SPACE) -= do_now_bytes;
79-
}
80-
(COUNT) -= total_count;
65+
do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size;
66+
assert( (do_now * do_now_bytes) <= (*SPACE) );
67+
68+
for(size_t _i = 0; _i < do_now; _i++ ) {
69+
OPAL_DATATYPE_SAFEGUARD_POINTER( _source, do_now_bytes, (SOURCE_BASE),
70+
(DATATYPE), (TOTAL_COUNT) );
71+
DO_DEBUG( opal_output( 0, "copy %s( %p, %p, %" PRIsize_t " ) => space %" PRIsize_t "\n",
72+
STRINGIFY(MEM_OP_NAME), (void*)_destination, (void*)_source, do_now_bytes, *(SPACE) - _i * do_now_bytes ); );
73+
MEM_OP( _destination, _source, do_now_bytes );
74+
_destination += _elem->extent;
75+
_source += _elem->extent;
8176
}
77+
*(SPACE) -= (do_now_bytes * do_now);
8278
}
8379

8480
static inline void _contiguous_loop( const dt_elem_desc_t* ELEM,

opal/datatype/opal_datatype_module.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,6 @@ int32_t opal_datatype_init( void )
252252
OPAL_DATATYPE_FLAG_CONTIGUOUS |
253253
OPAL_DATATYPE_FLAG_NO_GAPS;
254254
datatype->desc.desc[0].elem.common.type = i;
255-
/* datatype->desc.desc[0].elem.blocklen XXX not set at the moment, it will be needed later */
256255
datatype->desc.desc[0].elem.count = 1;
257256
datatype->desc.desc[0].elem.blocklen = 1;
258257
datatype->desc.desc[0].elem.disp = 0;

opal/datatype/opal_datatype_pack.c

Lines changed: 12 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ opal_pack_homogeneous_contig_with_gaps_function( opal_convertor_t* pConv,
113113
dt_stack_t* stack = pConv->pStack;
114114
ptrdiff_t extent = pData->ub - pData->lb;
115115
unsigned char *user_memory, *packed_buffer;
116-
uint32_t idx = 0;
116+
uint32_t idx;
117117
size_t i;
118118

119119
/* The memory layout is contiguous with gaps in the beginning and at the end. The datatype true_lb
@@ -125,31 +125,26 @@ opal_pack_homogeneous_contig_with_gaps_function( opal_convertor_t* pConv,
125125
(void*)pConv->pBaseBuf, *out_size ); );
126126
if( stack[1].type != opal_datatype_uint1.id ) {
127127
stack[1].count *= opal_datatype_basicDatatypes[stack[1].type]->size;
128-
stack[1].type = opal_datatype_uint1.id;
128+
stack[1].type = opal_datatype_uint1.id;
129129
}
130130
/* We can provide directly the pointers in the user buffers (like the convertor_raw) */
131131
if( NULL == iov[0].iov_base ) {
132132
user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp + stack[1].disp;
133-
if( stack[1].count != pData->size ) {
133+
for( idx = 0; (idx < (*out_size)) && stack[0].count; idx++ ) {
134134
iov[idx].iov_base = user_memory;
135-
iov[idx].iov_len = stack[1].count;
135+
iov[idx].iov_len = stack[1].count;
136136
COMPUTE_CSUM( iov[idx].iov_base, iov[idx].iov_len, pConv );
137-
stack[0].count--; /* update the first stack position */
137+
138+
user_memory += extent;
139+
pConv->bConverted += stack[1].count;
140+
138141
stack[0].disp += extent;
139-
stack[1].count = pData->size; /* for safety */
142+
stack[0].count--;
140143
stack[1].disp = 0;
141-
idx++; /* update next iovec */
144+
stack[1].count = pData->size; /* we might need this to update the partial
145+
* length for the first iteration */
146+
142147
user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp;
143-
pConv->bConverted += stack[1].count;
144-
}
145-
for( ; (idx < (*out_size)) && stack[0].count; idx++ ) {
146-
iov[idx].iov_base = user_memory;
147-
iov[idx].iov_len = pData->size;
148-
COMPUTE_CSUM( iov[idx].iov_base, iov[idx].iov_len, pConv );
149-
stack[0].count--;
150-
stack[0].disp += extent;
151-
user_memory += extent;
152-
pConv->bConverted += pData->size;
153148
}
154149
goto update_status_and_return;
155150
}

opal/datatype/opal_datatype_pack.h

Lines changed: 42 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -35,82 +35,74 @@ pack_predefined_data( opal_convertor_t* CONVERTOR,
3535
size_t* SPACE )
3636
{
3737
const ddt_elem_desc_t* _elem = &((ELEM)->elem);
38-
size_t total_count = _elem->count * _elem->blocklen;
3938
size_t cando_count = (*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size;
4039
size_t do_now, do_now_bytes;
40+
size_t blocklen_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size;
4141
unsigned char* _memory = (*memory) + _elem->disp;
42+
unsigned char* _packed = *packed;
4243

4344
assert( *(COUNT) <= _elem->count * _elem->blocklen);
4445

4546
if( cando_count > *(COUNT) )
4647
cando_count = *(COUNT);
4748

4849
/**
49-
* First check if we already did something on this element ?
50+
* First check if we already did something on this element ? The COUNT is the number
51+
* of remaining predefined types in the current elem, not how many predefined types
52+
* should be manipulated in the current call (this number is instead reflected on the
53+
* SPACE).
5054
*/
51-
do_now = (total_count - *(COUNT)); /* done elements */
55+
do_now = *(COUNT) % _elem->blocklen; /* any partial elements ? */
56+
/* preemptively update the COUNT we will return. */
57+
*(COUNT) -= cando_count;
5258
if( 0 != do_now ) {
53-
do_now = do_now % _elem->blocklen; /* partial blocklen? */
54-
55-
if( 0 != do_now ) {
56-
size_t left_in_block = _elem->blocklen - do_now; /* left in the current blocklen */
57-
do_now = (left_in_block > cando_count ) ? cando_count : left_in_block;
58-
do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size;
59-
60-
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf,
61-
(CONVERTOR)->pDesc, (CONVERTOR)->count );
62-
DO_DEBUG( opal_output( 0, "pack 1. memcpy( %p, %p, %lu ) => space %lu [prolog]\n",
63-
(void*)*(packed), (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); );
64-
MEMCPY_CSUM( *(packed), _memory, do_now_bytes, (CONVERTOR) );
65-
_memory = (*memory) + _elem->disp + (ptrdiff_t)do_now_bytes;
66-
/* compensate if we just completed a blocklen */
67-
if( do_now == left_in_block )
68-
_memory += _elem->extent - (_elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size);
69-
*(packed) += do_now_bytes;
70-
*(SPACE) -= do_now_bytes;
71-
*(COUNT) -= do_now;
72-
cando_count -= do_now;
73-
}
59+
size_t left_in_block = do_now; /* left in the current blocklen */
60+
do_now = (do_now > cando_count ) ? cando_count : do_now;
61+
do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size;
62+
63+
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf,
64+
(CONVERTOR)->pDesc, (CONVERTOR)->count );
65+
DO_DEBUG( opal_output( 0, "pack 1. memcpy( %p, %p, %lu ) => space %lu [prolog]\n",
66+
_packed, (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); );
67+
MEMCPY_CSUM( _packed, _memory, do_now_bytes, (CONVERTOR) );
68+
_memory += (ptrdiff_t)do_now_bytes;
69+
/* compensate if we just completed a blocklen */
70+
if( do_now == left_in_block )
71+
_memory += _elem->extent - blocklen_bytes;
72+
_packed += do_now_bytes;
73+
cando_count -= do_now;
7474
}
7575

76-
/**
77-
* Compute how many full blocklen we need to do and do them.
78-
*/
79-
do_now = cando_count / _elem->blocklen;
80-
if( 0 != do_now ) {
81-
do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size;
82-
for(size_t _i = 0; _i < do_now; _i++ ) {
83-
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf,
84-
(CONVERTOR)->pDesc, (CONVERTOR)->count );
85-
DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n",
86-
(void*)*(packed), (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)*(SPACE) ); );
87-
MEMCPY_CSUM( *(packed), _memory, do_now_bytes, (CONVERTOR) );
88-
*(packed) += do_now_bytes;
89-
_memory += _elem->extent;
90-
*(SPACE) -= do_now_bytes;
91-
*(COUNT) -= _elem->blocklen;
92-
cando_count -= _elem->blocklen;
93-
}
76+
/* Do as many full blocklen as possible */
77+
for(size_t _i = 0; _elem->blocklen <= cando_count; _i++ ) {
78+
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf,
79+
(CONVERTOR)->pDesc, (CONVERTOR)->count );
80+
DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n",
81+
(void*)_packed, (void*)_memory, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); );
82+
MEMCPY_CSUM( _packed, _memory, blocklen_bytes, (CONVERTOR) );
83+
_packed += blocklen_bytes;
84+
_memory += _elem->extent;
85+
cando_count -= _elem->blocklen;
9486
}
9587

9688
/**
9789
* As an epilog do anything left from the last blocklen.
9890
*/
99-
do_now = cando_count;
100-
if( 0 != do_now ) {
101-
do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size;
91+
if( 0 != cando_count ) {
92+
assert( cando_count < _elem->blocklen );
93+
do_now_bytes = cando_count * opal_datatype_basicDatatypes[_elem->common.type]->size;
10294
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf,
10395
(CONVERTOR)->pDesc, (CONVERTOR)->count );
10496
DO_DEBUG( opal_output( 0, "pack 3. memcpy( %p, %p, %lu ) => space %lu [epilog]\n",
105-
(void*)*(packed), (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); );
106-
MEMCPY_CSUM( *(packed), _memory, do_now_bytes, (CONVERTOR) );
97+
(void*)_packed, (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); );
98+
MEMCPY_CSUM( _packed, _memory, do_now_bytes, (CONVERTOR) );
10799
_memory += do_now_bytes;
108-
*(packed) += do_now_bytes;
109-
*(SPACE) -= do_now_bytes;
110-
*(COUNT) -= do_now;
100+
_packed += do_now_bytes;
111101
}
112102

113103
*(memory) = _memory - _elem->disp;
104+
*(SPACE) -= (_packed - *packed);
105+
*(packed) = _packed;
114106
}
115107

116108
static inline void pack_contiguous_loop( opal_convertor_t* CONVERTOR,

opal/datatype/opal_datatype_position.c

Lines changed: 22 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,21 @@
4949
* - the DT_CONTIGUOUS flag for the type OPAL_DATATYPE_END_LOOP is meaningless.
5050
*/
5151

52+
/**
 * Shared bookkeeping step used by position_predefined_data: validate the
 * current memory pointer, emit a debug trace, then advance the pointer and
 * decrement the remaining space and element counters.
 *
 * @param CONVERTOR     convertor whose pBaseBuf, pDesc and count are handed
 *                      to the safeguard macro
 * @param mem           in/out: current memory pointer, advanced by mem_update
 * @param mem_update    bytes to advance *mem by; also passed as the length
 *                      argument of the safeguard macro
 * @param space         in/out: remaining space, decreased by space_update
 * @param space_update  amount subtracted from *space (also the size printed
 *                      in the debug trace)
 * @param cnt           in/out: remaining count, decreased by cnt_update
 * @param cnt_update    amount subtracted from *cnt
 *
 * NOTE(review): the debug message is tagged "[prolog]" unconditionally,
 * although this helper is also called from the full-blocklen loop and from
 * the epilog of position_predefined_data.
 * NOTE(review): the full-blocklen caller passes _elem->extent as mem_update,
 * so the safeguard is invoked with the extent rather than with the number of
 * bytes in the block -- confirm this is intended.
 */
static inline void
53+
position_single_block(opal_convertor_t* CONVERTOR,
54+
unsigned char** mem, ptrdiff_t mem_update,
55+
size_t* space, size_t space_update,
56+
size_t* cnt, size_t cnt_update)
57+
{
58+
OPAL_DATATYPE_SAFEGUARD_POINTER( *mem, mem_update, (CONVERTOR)->pBaseBuf, /* presumably a bounds check of [*mem, *mem + mem_update) -- macro is defined elsewhere */
59+
(CONVERTOR)->pDesc, (CONVERTOR)->count );
60+
DO_DEBUG( opal_output( 0, "position( %p, %lu ) => space %lu [prolog]\n", /* traces the state before the updates below */
61+
(void*)*mem, (unsigned long)space_update, (unsigned long)(*space) ); );
62+
*mem += mem_update; /* move the caller's memory pointer forward */
63+
*space -= space_update; /* consume the corresponding amount of remaining space */
64+
*cnt -= cnt_update; /* account for the elements just skipped */
65+
}
66+
5267
/**
5368
* Advance the current position in the convertor based using the
5469
* current element and a left-over counter. Update the head pointer
@@ -84,16 +99,12 @@ position_predefined_data( opal_convertor_t* CONVERTOR,
8499
do_now = (left_in_block > cando_count ) ? cando_count : left_in_block;
85100
do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size;
86101

87-
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf,
88-
(CONVERTOR)->pDesc, (CONVERTOR)->count );
89-
DO_DEBUG( opal_output( 0, "position( %p, %lu ) => space %lu [prolog]\n",
90-
(void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); );
91-
_memory = *(POINTER) + _elem->disp + (ptrdiff_t)do_now_bytes;
102+
position_single_block( CONVERTOR, &_memory, do_now_bytes,
103+
SPACE, do_now_bytes, COUNT, do_now );
104+
92105
/* compensate if we just completed a blocklen */
93106
if( do_now == left_in_block )
94107
_memory += _elem->extent - (_elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size);
95-
*(SPACE) -= do_now_bytes;
96-
*(COUNT) -= do_now;
97108
cando_count -= do_now;
98109
}
99110
}
@@ -105,13 +116,8 @@ position_predefined_data( opal_convertor_t* CONVERTOR,
105116
if( 0 != do_now ) {
106117
do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size;
107118
for(size_t _i = 0; _i < do_now; _i++ ) {
108-
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf,
109-
(CONVERTOR)->pDesc, (CONVERTOR)->count );
110-
DO_DEBUG( opal_output( 0, "position( %p, %lu ) => space %lu\n",
111-
(void*)_memory, (unsigned long)do_now_bytes, (unsigned long)*(SPACE) ); );
112-
_memory += _elem->extent;
113-
*(SPACE) -= do_now_bytes;
114-
*(COUNT) -= _elem->blocklen;
119+
position_single_block( CONVERTOR, &_memory, _elem->extent,
120+
SPACE, do_now_bytes, COUNT, _elem->blocklen );
115121
cando_count -= _elem->blocklen;
116122
}
117123
}
@@ -122,13 +128,8 @@ position_predefined_data( opal_convertor_t* CONVERTOR,
122128
do_now = cando_count;
123129
if( 0 != do_now ) {
124130
do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size;
125-
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf,
126-
(CONVERTOR)->pDesc, (CONVERTOR)->count );
127-
DO_DEBUG( opal_output( 0, "position( %p, %lu ) => space %lu [epilog]\n",
128-
(void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); );
129-
_memory += do_now_bytes;
130-
*(SPACE) -= do_now_bytes;
131-
*(COUNT) -= do_now;
131+
position_single_block( CONVERTOR, &_memory, do_now_bytes,
132+
SPACE, do_now_bytes, COUNT, do_now );
132133
}
133134

134135
*(POINTER) = _memory - _elem->disp;

0 commit comments

Comments
 (0)