@@ -153,7 +153,7 @@ int32_t opal_unpack_homogeneous_contig_function(opal_convertor_t *pConv, struct
153
153
}
154
154
}
155
155
}
156
- * out_size = iov_idx ; /* we only reach this line after the for loop succesfully complete */
156
+ * out_size = iov_idx ; /* we only reach this line after the for loop successfully complete */
157
157
* max_data = pConv -> bConverted - initial_bytes_converted ;
158
158
if (pConv -> bConverted == pConv -> local_size ) {
159
159
pConv -> flags |= CONVERTOR_COMPLETED ;
@@ -173,63 +173,71 @@ int32_t opal_unpack_homogeneous_contig_function(opal_convertor_t *pConv, struct
173
173
* change the content of the data (as in all conversions that require changing the size
174
174
* of the exponent or mantissa).
175
175
*/
176
- static inline void opal_unpack_partial_datatype ( opal_convertor_t * pConvertor , dt_elem_desc_t * pElem ,
177
- unsigned char * partial_data ,
178
- ptrdiff_t start_position , size_t length ,
179
- unsigned char * * user_buffer )
176
+ static inline void
177
+ opal_unpack_partial_predefined ( opal_convertor_t * pConvertor , const dt_elem_desc_t * pElem ,
178
+ size_t * COUNT , unsigned char * * packed ,
179
+ unsigned char * * memory , size_t * SPACE )
180
180
{
181
181
char unused_byte = 0x7F , saved_data [16 ];
182
182
unsigned char temporary [16 ], * temporary_buffer = temporary ;
183
- unsigned char * user_data = * user_buffer + pElem -> elem .disp ;
184
- size_t count_desc = 1 ;
183
+ unsigned char * user_data = * memory + pElem -> elem .disp ;
185
184
size_t data_length = opal_datatype_basicDatatypes [pElem -> elem .common .type ]-> size ;
185
+ unsigned char * partial_data = * packed ;
186
+ ptrdiff_t start_position = pConvertor -> partial_length ;
187
+ size_t length = data_length - start_position ;
188
+ size_t count_desc = 1 ;
189
+ dt_elem_desc_t single_elem = { .elem = { .common = pElem -> elem .common , .count = 1 , .blocklen = 1 ,
190
+ .extent = data_length , /* advance by a full data element */
191
+ .disp = 0 /* right where the pointer is */ } };
192
+ if ( * SPACE < length ) {
193
+ length = * SPACE ;
194
+ }
186
195
187
- DO_DEBUG (opal_output (0 ,
188
- "unpack partial data start %lu end %lu data_length %lu user %p\n"
189
- "\tbConverted %lu total_length %lu count %ld\n" ,
190
- (unsigned long ) start_position , (unsigned long ) start_position + length ,
191
- (unsigned long ) data_length , (void * ) * user_buffer ,
192
- (unsigned long ) pConvertor -> bConverted ,
193
- (unsigned long ) pConvertor -> local_size , pConvertor -> count ););
194
-
195
- /* Find a byte that is not used in the partial buffer */
196
- find_unused_byte :
197
- for (size_t i = 0 ; i < length ; i ++ ) {
198
- if (unused_byte == partial_data [i ]) {
196
+ DO_DEBUG ( opal_output ( 0 , "unpack partial data start %lu end %lu data_length %lu user %p\n"
197
+ "\tbConverted %lu total_length %lu count %ld\n" ,
198
+ (unsigned long )start_position , (unsigned long )start_position + length ,
199
+ (unsigned long )data_length , (void * )* memory ,
200
+ (unsigned long )pConvertor -> bConverted ,
201
+ (unsigned long )pConvertor -> local_size , pConvertor -> count ); );
202
+ COMPUTE_CSUM ( partial_data , length , pConvertor );
203
+
204
+ /* Find a byte value that is not used in the partial buffer. We use it as a marker
205
+ * to identify what has not been modified by the unpack call. */
206
+ find_unused_byte :
207
+ for (size_t i = 0 ; i < length ; i ++ ) {
208
+ if ( unused_byte == partial_data [i ] ) {
199
209
unused_byte -- ;
200
210
goto find_unused_byte ;
201
211
}
202
212
}
203
213
204
- /* Copy and fill the rest of the buffer with the unused byte */
205
- memset (temporary , unused_byte , data_length );
206
- MEMCPY (temporary + start_position , partial_data , length );
214
+ /* Prepare an full element of the predefined type, by populating an entire type
215
+ * with the unused byte and then put the partial data at the right position. */
216
+ memset ( temporary , unused_byte , data_length );
217
+ MEMCPY ( temporary + start_position , partial_data , length );
207
218
219
+ /* Save the original content of the user memory */
208
220
#if OPAL_CUDA_SUPPORT
209
221
/* In the case where the data is being unpacked from device memory, need to
210
- * use the special host to device memory copy. Note this code path was only
211
- * seen on large receives of noncontiguous data via buffered sends. */
212
- pConvertor -> cbmemcpy (saved_data , user_data , data_length , pConvertor );
222
+ * use the special host to device memory copy. */
223
+ pConvertor -> cbmemcpy (saved_data , user_data , data_length , pConvertor );
213
224
#else
214
- /* Save the content of the user memory */
215
- MEMCPY (saved_data , user_data , data_length );
225
+ MEMCPY ( saved_data , user_data , data_length );
216
226
#endif
217
227
218
228
/* Then unpack the data into the user memory */
219
- UNPACK_PREDEFINED_DATATYPE (pConvertor , pElem , count_desc , temporary_buffer , * user_buffer ,
229
+ UNPACK_PREDEFINED_DATATYPE (pConvertor , & single_elem , count_desc , temporary_buffer , user_data ,
220
230
data_length );
221
231
222
- /* reload the length as it is reset by the macro */
232
+ /* reload the length and user buffer as they have been updated by the macro */
223
233
data_length = opal_datatype_basicDatatypes [pElem -> elem .common .type ]-> size ;
234
+ user_data = * memory + pElem -> elem .disp ;
224
235
225
- /* For every occurrence of the unused byte move data from the saved
226
- * buffer back into the user memory.
227
- */
236
+ /* Rebuild the data by pulling back the unmodified bytes from the original
237
+ * content in the user memory. */
228
238
#if OPAL_CUDA_SUPPORT
229
239
/* Need to copy the modified user_data again so we can see which
230
- * bytes need to be converted back to their original values. Note
231
- * this code path was only seen on large receives of noncontiguous
232
- * data via buffered sends. */
240
+ * bytes need to be converted back to their original values. */
233
241
{
234
242
char resaved_data [16 ];
235
243
pConvertor -> cbmemcpy (resaved_data , user_data , data_length , pConvertor );
@@ -245,6 +253,16 @@ static inline void opal_unpack_partial_datatype(opal_convertor_t *pConvertor, dt
245
253
}
246
254
}
247
255
#endif
256
+ pConvertor -> partial_length = (pConvertor -> partial_length + length ) % data_length ;
257
+ * SPACE -= length ;
258
+ * packed += length ;
259
+ if (0 == pConvertor -> partial_length ) {
260
+ (* COUNT )-- ; /* we have enough to complete one full predefined type */
261
+ * memory += data_length ;
262
+ if (0 == (* COUNT % pElem -> elem .blocklen )) {
263
+ * memory += pElem -> elem .extent - (pElem -> elem .blocklen * data_length );
264
+ }
265
+ }
248
266
}
249
267
250
268
/* The pack/unpack functions need a cleanup. I have to create a proper interface to access
@@ -271,9 +289,8 @@ int32_t opal_generic_simple_unpack_function(opal_convertor_t *pConvertor, struct
271
289
size_t iov_len_local ;
272
290
uint32_t iov_count ;
273
291
274
- DO_DEBUG (opal_output (0 , "opal_convertor_generic_simple_unpack( %p, {%p, %lu}, %u )\n" ,
275
- (void * ) pConvertor , (void * ) iov [0 ].iov_base ,
276
- (unsigned long ) iov [0 ].iov_len , * out_size ););
292
+ DO_DEBUG ( opal_output ( 0 , "opal_convertor_generic_simple_unpack( %p, iov[%u] = {%p, %lu} )\n" ,
293
+ (void * )pConvertor , * out_size , (void * )iov [0 ].iov_base , (unsigned long )iov [0 ].iov_len ); );
277
294
278
295
description = pConvertor -> use_desc -> desc ;
279
296
@@ -300,26 +317,25 @@ int32_t opal_generic_simple_unpack_function(opal_convertor_t *pConvertor, struct
300
317
iov_ptr = (unsigned char * ) iov [iov_count ].iov_base ;
301
318
iov_len_local = iov [iov_count ].iov_len ;
302
319
303
- if (0 != pConvertor -> partial_length ) {
304
- size_t element_length = opal_datatype_basicDatatypes [pElem -> elem .common .type ]-> size ;
305
- size_t missing_length = element_length - pConvertor -> partial_length ;
306
-
307
- assert (pElem -> elem .common .flags & OPAL_DATATYPE_FLAG_DATA );
308
- COMPUTE_CSUM (iov_ptr , missing_length , pConvertor );
309
- opal_unpack_partial_datatype (pConvertor , pElem , iov_ptr , pConvertor -> partial_length ,
310
- (size_t )(element_length - pConvertor -> partial_length ),
311
- & conv_ptr );
312
- -- count_desc ;
313
- if (0 == count_desc ) {
314
- conv_ptr = pConvertor -> pBaseBuf + pStack -> disp ;
315
- pos_desc ++ ; /* advance to the next data */
316
- UPDATE_INTERNAL_COUNTERS (description , pos_desc , pElem , count_desc );
317
- }
318
- iov_ptr += missing_length ;
319
- iov_len_local -= missing_length ;
320
- pConvertor -> partial_length = 0 ; /* nothing more inside */
321
- }
320
+ /* Deal with all types of partial predefined datatype unpacking, including when
321
+ * unpacking a partial predefined element and when unpacking a part smaller than
322
+ * the blocklen.
323
+ */
322
324
if (pElem -> elem .common .flags & OPAL_DATATYPE_FLAG_DATA ) {
325
+ if (0 != pConvertor -> partial_length ) { /* partial predefined element */
326
+ assert ( pElem -> elem .common .flags & OPAL_DATATYPE_FLAG_DATA );
327
+ opal_unpack_partial_predefined ( pConvertor , pElem , & count_desc ,
328
+ & iov_ptr , & conv_ptr , & iov_len_local );
329
+ if (0 == count_desc ) { /* the end of the vector ? */
330
+ assert ( 0 == pConvertor -> partial_length );
331
+ conv_ptr = pConvertor -> pBaseBuf + pStack -> disp ;
332
+ pos_desc ++ ; /* advance to the next data */
333
+ UPDATE_INTERNAL_COUNTERS (description , pos_desc , pElem , count_desc );
334
+ goto next_vector ;
335
+ }
336
+ if ( 0 == iov_len_local )
337
+ goto complete_loop ;
338
+ }
323
339
if (((size_t ) pElem -> elem .count * pElem -> elem .blocklen ) != count_desc ) {
324
340
/* we have a partial (less than blocklen) basic datatype */
325
341
int rc = UNPACK_PARTIAL_BLOCKLEN (pConvertor , pElem , count_desc , iov_ptr , conv_ptr ,
@@ -336,6 +352,7 @@ int32_t opal_generic_simple_unpack_function(opal_convertor_t *pConvertor, struct
336
352
}
337
353
338
354
while (1 ) {
355
+ next_vector :
339
356
while (pElem -> elem .common .flags & OPAL_DATATYPE_FLAG_DATA ) {
340
357
/* we have a basic datatype (working on full blocks) */
341
358
UNPACK_PREDEFINED_DATATYPE (pConvertor , pElem , count_desc , iov_ptr , conv_ptr ,
@@ -401,19 +418,14 @@ int32_t opal_generic_simple_unpack_function(opal_convertor_t *pConvertor, struct
401
418
}
402
419
}
403
420
complete_loop :
404
- assert (pElem -> elem .common .type < OPAL_DATATYPE_MAX_PREDEFINED );
405
- if (( pElem -> elem .common .flags & OPAL_DATATYPE_FLAG_DATA ) && (0 != iov_len_local )) {
406
- unsigned char * temp = conv_ptr ;
421
+ assert ( pElem -> elem .common .type < OPAL_DATATYPE_MAX_PREDEFINED );
422
+ if ( ( pElem -> elem .common .flags & OPAL_DATATYPE_FLAG_DATA ) && (0 != iov_len_local ) ) {
423
+ unsigned char * temp = conv_ptr ;
407
424
/* We have some partial data here. Let's copy it into the convertor
408
425
* and keep it hot until the next round.
409
426
*/
410
- assert (iov_len_local < opal_datatype_basicDatatypes [pElem -> elem .common .type ]-> size );
411
- COMPUTE_CSUM (iov_ptr , iov_len_local , pConvertor );
412
-
413
- opal_unpack_partial_datatype (pConvertor , pElem , iov_ptr , 0 , iov_len_local , & temp );
414
-
415
- pConvertor -> partial_length = iov_len_local ;
416
- iov_len_local = 0 ;
427
+ assert ( iov_len_local < opal_datatype_basicDatatypes [pElem -> elem .common .type ]-> size );
428
+ opal_unpack_partial_predefined (pConvertor , pElem , & count_desc , & iov_ptr , & temp , & iov_len_local );
417
429
}
418
430
419
431
iov [iov_count ].iov_len -= iov_len_local ; /* update the amount of valid data */
@@ -543,11 +555,6 @@ int32_t opal_unpack_general_function(opal_convertor_t *pConvertor, struct iovec
543
555
unsigned char * conv_ptr , * iov_ptr ;
544
556
uint32_t iov_count ;
545
557
size_t iov_len_local ;
546
- #if 0
547
- const opal_convertor_master_t * master = pConvertor -> master ;
548
- ptrdiff_t advance ; /* number of bytes that we should advance the buffer */
549
- #endif
550
- size_t rc ;
551
558
552
559
DO_DEBUG (opal_output (0 , "opal_convertor_general_unpack( %p, {%p, %lu}, %d )\n" ,
553
560
(void * ) pConvertor , (void * ) iov [0 ].iov_base ,
@@ -609,13 +616,9 @@ int32_t opal_unpack_general_function(opal_convertor_t *pConvertor, struct iovec
609
616
* and keep it hot until the next round.
610
617
*/
611
618
assert (iov_len_local < opal_datatype_basicDatatypes [pElem -> elem .common .type ]-> size );
612
- COMPUTE_CSUM (iov_ptr , iov_len_local , pConvertor );
613
-
614
- opal_unpack_partial_datatype (pConvertor , pElem , iov_ptr , 0 , iov_len_local ,
615
- & temp );
616
-
617
- pConvertor -> partial_length = iov_len_local ;
618
- iov_len_local = 0 ;
619
+ opal_unpack_partial_predefined (pConvertor , pElem , & count_desc , & iov_ptr ,
620
+ & temp , & iov_len_local );
621
+ assert ( 0 == iov_len_local );
619
622
}
620
623
goto complete_loop ;
621
624
}
0 commit comments