From f68b06e9ee01d79469d691019e4aca7535ceb4a2 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Thu, 9 May 2019 16:27:49 -0400 Subject: [PATCH 01/14] Fix incorrect behavior with length == 0 Fixes #6575. Signed-off-by: George Bosilca --- .../ompi_datatype_create_contiguous.c | 13 ++- ompi/datatype/ompi_datatype_create_darray.c | 4 +- ompi/datatype/ompi_datatype_create_indexed.c | 79 +++++++++---------- ompi/datatype/ompi_datatype_create_struct.c | 38 ++++----- ompi/datatype/ompi_datatype_create_vector.c | 21 ++--- 5 files changed, 68 insertions(+), 87 deletions(-) diff --git a/ompi/datatype/ompi_datatype_create_contiguous.c b/ompi/datatype/ompi_datatype_create_contiguous.c index fb44673ef5c..6a287caa41c 100644 --- a/ompi/datatype/ompi_datatype_create_contiguous.c +++ b/ompi/datatype/ompi_datatype_create_contiguous.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -29,13 +29,12 @@ int32_t ompi_datatype_create_contiguous( int count, const ompi_datatype_t* oldTy { ompi_datatype_t* pdt; - if( 0 == count ) { - pdt = ompi_datatype_create( 0 ); - ompi_datatype_add( pdt, &ompi_mpi_datatype_null.dt, 0, 0, 0 ); - } else { - pdt = ompi_datatype_create( oldType->super.desc.used + 2 ); - opal_datatype_add( &(pdt->super), &(oldType->super), count, 0, (oldType->super.ub - oldType->super.lb) ); + if( (0 == count) || (0 == oldType->super.size) ) { + return ompi_datatype_duplicate( &ompi_mpi_datatype_null.dt, newType); } + + pdt = ompi_datatype_create( oldType->super.desc.used + 2 ); + opal_datatype_add( &(pdt->super), &(oldType->super), count, 0, (oldType->super.ub - oldType->super.lb) ); *newType = pdt; return OMPI_SUCCESS; } diff --git a/ompi/datatype/ompi_datatype_create_darray.c b/ompi/datatype/ompi_datatype_create_darray.c index a245dcebce4..e0292755c4b 100644 --- a/ompi/datatype/ompi_datatype_create_darray.c +++ b/ompi/datatype/ompi_datatype_create_darray.c @@ -192,9 +192,7 @@ int32_t ompi_datatype_create_darray(int size, if (ndims < 1) { /* Don't just return MPI_DATATYPE_NULL as that can't be MPI_TYPE_FREE()ed, and that seems bad */ - *newtype = ompi_datatype_create(0); - ompi_datatype_add(*newtype, &ompi_mpi_datatype_null.dt, 0, 0, 0); - return MPI_SUCCESS; + return ompi_datatype_duplicate( &ompi_mpi_datatype_null.dt, newtype); } rc = ompi_datatype_type_extent(oldtype, &orig_extent); diff --git a/ompi/datatype/ompi_datatype_create_indexed.c b/ompi/datatype/ompi_datatype_create_indexed.c index 457efb1e6ff..e72b41afc7d 100644 --- a/ompi/datatype/ompi_datatype_create_indexed.c +++ b/ompi/datatype/ompi_datatype_create_indexed.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -34,24 +34,28 @@ int32_t ompi_datatype_create_indexed( int count, const int* pBlockLength, const int* pDisp, const ompi_datatype_t* oldType, ompi_datatype_t** newType ) { - ompi_datatype_t* pdt; - int i; ptrdiff_t extent, disp, endat; + ompi_datatype_t* pdt; size_t dLength; + int i; - if( 0 == count ) { + /* ignore all cases that lead to an empty type */ + ompi_datatype_type_size(oldType, &dLength); + for( i = 0; (i < count) && (0 == pBlockLength[i]); i++ ); /* find first non zero */ + if( (i == count) || (0 == dLength) ) { return ompi_datatype_duplicate( &ompi_mpi_datatype_null.dt, newType); } - disp = pDisp[0]; - dLength = pBlockLength[0]; + disp = pDisp[i]; + dLength = pBlockLength[i]; endat = disp + dLength; ompi_datatype_type_extent( oldType, &extent ); - pdt = ompi_datatype_create( count * (2 + oldType->super.desc.used) ); - for( i = 1; i < count; i++ ) { - if( endat == pDisp[i] ) { - /* contiguous with the previsious */ + pdt = ompi_datatype_create( (count - i) * (2 + oldType->super.desc.used) ); + for( i += 1; i < count; i++ ) { + if( 0 == pBlockLength[i] ) /* ignore empty length */ + continue; + if( endat == pDisp[i] ) { /* contiguous with the previsious */ dLength += pBlockLength[i]; endat += pBlockLength[i]; } else { @@ -71,26 +75,28 @@ int32_t ompi_datatype_create_indexed( int count, const int* pBlockLength, const int32_t ompi_datatype_create_hindexed( int count, const int* pBlockLength, const ptrdiff_t* pDisp, const ompi_datatype_t* oldType, ompi_datatype_t** newType ) { - ompi_datatype_t* pdt; - int i; ptrdiff_t extent, disp, endat; + ompi_datatype_t* pdt; size_t dLength; + int i; - if( 0 == count ) { - *newType = ompi_datatype_create( 0 ); - ompi_datatype_add( *newType, &ompi_mpi_datatype_null.dt, 0, 0, 0); - return OMPI_SUCCESS; + /* ignore all cases that lead to an empty type */ + ompi_datatype_type_size(oldType, &dLength); + for( i = 0; (i < count) && (0 == pBlockLength[i]); i++ ); /* find first non zero */ + if( (i == count) || (0 == dLength) ) { + return ompi_datatype_duplicate( &ompi_mpi_datatype_null.dt, newType); } - ompi_datatype_type_extent( oldType, &extent ); - pdt = ompi_datatype_create( count * (2 + oldType->super.desc.used) ); - disp = pDisp[0]; - dLength = pBlockLength[0]; + disp = pDisp[i]; + dLength = pBlockLength[i]; endat = disp + dLength * extent; + ompi_datatype_type_extent( oldType, &extent ); - for( i = 1; i < count; i++ ) { - if( endat == pDisp[i] ) { - /* contiguous with the previsious */ + pdt = ompi_datatype_create( (count - i) * (2 + oldType->super.desc.used) ); + for( i += 1; i < count; i++ ) { + if( 0 == pBlockLength[i] ) /* ignore empty length */ + continue; + if( endat == pDisp[i] ) { /* contiguous with the previsious */ dLength += pBlockLength[i]; endat += pBlockLength[i] * extent; } else { @@ -110,21 +116,15 @@ int32_t ompi_datatype_create_hindexed( int count, const int* pBlockLength, const int32_t ompi_datatype_create_indexed_block( int count, int bLength, const int* pDisp, const ompi_datatype_t* oldType, ompi_datatype_t** newType ) { - ompi_datatype_t* pdt; - int i; ptrdiff_t extent, disp, endat; + ompi_datatype_t* pdt; size_t dLength; + int i; - ompi_datatype_type_extent( oldType, &extent ); if( (count == 0) || (bLength == 0) ) { - if( 0 == count ) { - return ompi_datatype_duplicate(&ompi_mpi_datatype_null.dt, newType); - } else { - *newType = ompi_datatype_create(1); - ompi_datatype_add( *newType, oldType, 0, pDisp[0] * extent, extent ); - return OMPI_SUCCESS; - } + return ompi_datatype_duplicate(&ompi_mpi_datatype_null.dt, newType); } + ompi_datatype_type_extent( oldType, &extent ); pdt = ompi_datatype_create( count * (2 + oldType->super.desc.used) ); disp = pDisp[0]; dLength = bLength; @@ -150,20 +150,15 @@ int32_t ompi_datatype_create_indexed_block( int count, int bLength, const int* p int32_t ompi_datatype_create_hindexed_block( int count, int bLength, const ptrdiff_t* pDisp, const ompi_datatype_t* oldType, ompi_datatype_t** newType ) { - ompi_datatype_t* pdt; - int i; ptrdiff_t extent, disp, endat; + ompi_datatype_t* pdt; size_t dLength; + int i; - ompi_datatype_type_extent( oldType, &extent ); if( (count == 0) || (bLength == 0) ) { - *newType = ompi_datatype_create(1); - if( 0 == count ) - ompi_datatype_add( *newType, &ompi_mpi_datatype_null.dt, 0, 0, 0 ); - else - ompi_datatype_add( *newType, oldType, 0, pDisp[0] * extent, extent ); - return OMPI_SUCCESS; + return ompi_datatype_duplicate(&ompi_mpi_datatype_null.dt, newType); } + ompi_datatype_type_extent( oldType, &extent ); pdt = ompi_datatype_create( count * (2 + oldType->super.desc.used) ); disp = pDisp[0]; dLength = bLength; diff --git a/ompi/datatype/ompi_datatype_create_struct.c b/ompi/datatype/ompi_datatype_create_struct.c index 98daa8bacbb..9c78f53fee3 100644 --- a/ompi/datatype/ompi_datatype_create_struct.c +++ b/ompi/datatype/ompi_datatype_create_struct.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -31,27 +31,27 @@ int32_t ompi_datatype_create_struct( int count, const int* pBlockLength, const ptrdiff_t* pDisp, ompi_datatype_t* const * pTypes, ompi_datatype_t** newType ) { - int i; ptrdiff_t disp = 0, endto, lastExtent, lastDisp; - int lastBlock; ompi_datatype_t *pdt, *lastType; + int lastBlock; + int i, start_from; - if( 0 == count ) { - *newType = ompi_datatype_create( 0 ); - ompi_datatype_add( *newType, &ompi_mpi_datatype_null.dt, 0, 0, 0); - return OMPI_SUCCESS; + /* Find first non-zero length element */ + for( i = 0; (i < count) && (0 == pBlockLength[i]); i++ ); + if( i == count ) { /* either nothing or nothing relevant */ + return ompi_datatype_duplicate( &ompi_mpi_datatype_null.dt, newType); } - - /* if we compute the total number of elements before we can + /* compute the total number of elements before we can * avoid increasing the size of the desc array often. */ - lastType = (ompi_datatype_t*)pTypes[0]; - lastBlock = pBlockLength[0]; + start_from = i; + lastType = (ompi_datatype_t*)pTypes[start_from]; + lastBlock = pBlockLength[start_from]; lastExtent = lastType->super.ub - lastType->super.lb; - lastDisp = pDisp[0]; - endto = pDisp[0] + lastExtent * lastBlock; + lastDisp = pDisp[start_from]; + endto = pDisp[start_from] + lastExtent * lastBlock; - for( i = 1; i < count; i++ ) { + for( i = (start_from + 1); i < count; i++ ) { if( (pTypes[i] == lastType) && (pDisp[i] == endto) ) { lastBlock += pBlockLength[i]; endto = lastDisp + lastBlock * lastExtent; @@ -68,16 +68,16 @@ int32_t ompi_datatype_create_struct( int count, const int* pBlockLength, const p disp += lastType->super.desc.used; if( lastBlock != 1 ) disp += 2; - lastType = (ompi_datatype_t*)pTypes[0]; - lastBlock = pBlockLength[0]; + lastType = (ompi_datatype_t*)pTypes[start_from]; + lastBlock = pBlockLength[start_from]; lastExtent = lastType->super.ub - lastType->super.lb; - lastDisp = pDisp[0]; - endto = pDisp[0] + lastExtent * lastBlock; + lastDisp = pDisp[start_from]; + endto = pDisp[start_from] + lastExtent * lastBlock; pdt = ompi_datatype_create( (int32_t)disp ); /* Do again the same loop but now add the elements */ - for( i = 1; i < count; i++ ) { + for( i = (start_from + 1); i < count; i++ ) { if( (pTypes[i] == lastType) && (pDisp[i] == endto) ) { lastBlock += pBlockLength[i]; endto = lastDisp + lastBlock * lastExtent; diff --git a/ompi/datatype/ompi_datatype_create_vector.c b/ompi/datatype/ompi_datatype_create_vector.c index 1de8df4d2d2..c4829a4b54c 100644 --- a/ompi/datatype/ompi_datatype_create_vector.c +++ b/ompi/datatype/ompi_datatype_create_vector.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -28,23 +28,14 @@ #include "ompi/datatype/ompi_datatype.h" -/* Open questions ... - * - how to improuve the handling of these vectors (creating a temporary datatype - * can be ONLY a initial solution. - * - */ - int32_t ompi_datatype_create_vector( int count, int bLength, int stride, const ompi_datatype_t* oldType, ompi_datatype_t** newType ) { ompi_datatype_t *pTempData, *pData; ptrdiff_t extent = oldType->super.ub - oldType->super.lb; - - if( 0 == count ) { - *newType = ompi_datatype_create( 0 ); - ompi_datatype_add( *newType, &ompi_mpi_datatype_null.dt, 0, 0, 0); - return OMPI_SUCCESS; + if( (0 == count) || (0 == bLength) ) { + return ompi_datatype_duplicate( &ompi_mpi_datatype_null.dt, newType); } pData = ompi_datatype_create( oldType->super.desc.used + 2 ); @@ -72,10 +63,8 @@ int32_t ompi_datatype_create_hvector( int count, int bLength, ptrdiff_t stride, ompi_datatype_t *pTempData, *pData; ptrdiff_t extent = oldType->super.ub - oldType->super.lb; - if( 0 == count ) { - *newType = ompi_datatype_create( 0 ); - ompi_datatype_add( *newType, &ompi_mpi_datatype_null.dt, 0, 0, 0); - return OMPI_SUCCESS; + if( (0 == count) || (0 == bLength) ) { + return ompi_datatype_duplicate( &ompi_mpi_datatype_null.dt, newType); } pTempData = ompi_datatype_create( oldType->super.desc.used + 2 ); From 4f754d01562340f66d4eee40913dca3786a38909 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 15 May 2019 23:41:22 -0400 Subject: [PATCH 02/14] Optimized datatype description. Move toward a base type of vector (count, type, blocklen, extent, disp) with disp and extent applying toward the count repertition and blocklen being a contiguous memory of type type. Implement 2 optimizations on this description used during type_commit: - collapse: successive similar datatype descriptions are collapsed together with an increased count. - fusion: fuse successive datatype descriptions in order to minimize the number of resulting memcpy during pack/unpack. Fixes at the OMPI datatype level including: - Fix the create_hindexed and vector creation. - Fix the handling of [get|set]_elements and _count. - Correctly compute the dispacement for block indexed types. - Support the MPI_LB and MPI_UB deprecation, aka. OMPI_ENABLE_MPI1_COMPAT. Signed-off-by: George Bosilca --- ompi/datatype/ompi_datatype.h | 2 +- ompi/datatype/ompi_datatype_create_indexed.c | 8 +- ompi/datatype/ompi_datatype_external.c | 3 +- opal/datatype/opal_convertor.c | 17 +- opal/datatype/opal_datatype.h | 40 ++- opal/datatype/opal_datatype_add.c | 55 +++- opal/datatype/opal_datatype_copy.h | 63 ++-- opal/datatype/opal_datatype_get_count.c | 10 +- opal/datatype/opal_datatype_internal.h | 26 +- opal/datatype/opal_datatype_module.c | 1 + opal/datatype/opal_datatype_monotonic.c | 31 +- opal/datatype/opal_datatype_optimize.c | 287 +++++++++---------- opal/datatype/opal_datatype_pack.h | 144 ++++++---- opal/datatype/opal_datatype_position.c | 85 +++++- opal/datatype/opal_datatype_unpack.h | 145 ++++++---- 15 files changed, 548 insertions(+), 369 deletions(-) diff --git a/ompi/datatype/ompi_datatype.h b/ompi/datatype/ompi_datatype.h index 8b48bc30973..f589c874b64 100644 --- a/ompi/datatype/ompi_datatype.h +++ b/ompi/datatype/ompi_datatype.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2009-2013 The University of Tennessee and The University + * Copyright (c) 2009-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. diff --git a/ompi/datatype/ompi_datatype_create_indexed.c b/ompi/datatype/ompi_datatype_create_indexed.c index e72b41afc7d..2684d9d7df0 100644 --- a/ompi/datatype/ompi_datatype_create_indexed.c +++ b/ompi/datatype/ompi_datatype_create_indexed.c @@ -87,10 +87,10 @@ int32_t ompi_datatype_create_hindexed( int count, const int* pBlockLength, const return ompi_datatype_duplicate( &ompi_mpi_datatype_null.dt, newType); } + ompi_datatype_type_extent( oldType, &extent ); disp = pDisp[i]; dLength = pBlockLength[i]; endat = disp + dLength * extent; - ompi_datatype_type_extent( oldType, &extent ); pdt = ompi_datatype_create( (count - i) * (2 + oldType->super.desc.used) ); for( i += 1; i < count; i++ ) { @@ -162,17 +162,17 @@ int32_t ompi_datatype_create_hindexed_block( int count, int bLength, const ptrdi pdt = ompi_datatype_create( count * (2 + oldType->super.desc.used) ); disp = pDisp[0]; dLength = bLength; - endat = disp + dLength; + endat = disp + dLength * extent; for( i = 1; i < count; i++ ) { if( endat == pDisp[i] ) { /* contiguous with the previsious */ dLength += bLength; - endat += bLength; + endat += bLength * extent; } else { ompi_datatype_add( pdt, oldType, dLength, disp, extent ); disp = pDisp[i]; dLength = bLength; - endat = disp + bLength; + endat = disp + bLength * extent; } } ompi_datatype_add( pdt, oldType, dLength, disp, extent ); diff --git a/ompi/datatype/ompi_datatype_external.c b/ompi/datatype/ompi_datatype_external.c index d47531ef29e..53b907218cf 100644 --- a/ompi/datatype/ompi_datatype_external.c +++ b/ompi/datatype/ompi_datatype_external.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2016 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, @@ -26,7 +26,6 @@ #include #include "ompi/runtime/params.h" -#include "ompi/communicator/communicator.h" #include "ompi/datatype/ompi_datatype.h" #include "opal/datatype/opal_convertor.h" diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c index ce889f7e959..631d3adab43 100644 --- a/opal/datatype/opal_convertor.c +++ b/opal/datatype/opal_convertor.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -324,8 +324,9 @@ int32_t opal_convertor_unpack( opal_convertor_t* pConv, return pConv->fAdvance( pConv, iov, out_size, max_data ); } -static inline int opal_convertor_create_stack_with_pos_contig( opal_convertor_t* pConvertor, - size_t starting_point, const size_t* sizes ) +static inline int +opal_convertor_create_stack_with_pos_contig( opal_convertor_t* pConvertor, + size_t starting_point, const size_t* sizes ) { dt_stack_t* pStack; /* pointer to the position on the stack */ const opal_datatype_t* pData = pConvertor->pDesc; @@ -349,7 +350,7 @@ static inline int opal_convertor_create_stack_with_pos_contig( opal_convertor_t* pStack[0].disp = count * extent; /* now compute the number of pending bytes */ - count = starting_point - count * pData->size; + count = starting_point % pData->size; /** * We save the current displacement starting from the begining * of this data. @@ -370,9 +371,9 @@ static inline int opal_convertor_create_stack_with_pos_contig( opal_convertor_t* return OPAL_SUCCESS; } -static inline -int opal_convertor_create_stack_at_begining( opal_convertor_t* convertor, - const size_t* sizes ) +static inline int +opal_convertor_create_stack_at_begining( opal_convertor_t* convertor, + const size_t* sizes ) { dt_stack_t* pStack = convertor->pStack; dt_elem_desc_t* pElems; @@ -402,7 +403,7 @@ int opal_convertor_create_stack_at_begining( opal_convertor_t* convertor, pStack[1].count = pElems[0].loop.loops; pStack[1].type = OPAL_DATATYPE_LOOP; } else { - pStack[1].count = pElems[0].elem.count; + pStack[1].count = pElems[0].elem.count * pElems[0].elem.blocklen; pStack[1].type = pElems[0].elem.common.type; } return OPAL_SUCCESS; diff --git a/opal/datatype/opal_datatype.h b/opal/datatype/opal_datatype.h index a836a5aae03..e1bc18c67f9 100644 --- a/opal/datatype/opal_datatype.h +++ b/opal/datatype/opal_datatype.h @@ -224,13 +224,41 @@ opal_datatype_is_contiguous_memory_layout( const opal_datatype_t* datatype, int3 } -OPAL_DECLSPEC void opal_datatype_dump( const opal_datatype_t* pData ); +OPAL_DECLSPEC void +opal_datatype_dump( const opal_datatype_t* pData ); + /* data creation functions */ -OPAL_DECLSPEC int32_t opal_datatype_clone( const opal_datatype_t * src_type, opal_datatype_t * dest_type ); -OPAL_DECLSPEC int32_t opal_datatype_create_contiguous( int count, const opal_datatype_t* oldType, opal_datatype_t** newType ); -OPAL_DECLSPEC int32_t opal_datatype_resize( opal_datatype_t* type, ptrdiff_t lb, ptrdiff_t extent ); -OPAL_DECLSPEC int32_t opal_datatype_add( opal_datatype_t* pdtBase, const opal_datatype_t* pdtAdd, size_t count, - ptrdiff_t disp, ptrdiff_t extent ); + +/** + * Create a duplicate of the source datatype. + */ +OPAL_DECLSPEC int32_t +opal_datatype_clone( const opal_datatype_t* src_type, + opal_datatype_t* dest_type ); +/** + * A contiguous array of identical datatypes. + */ +OPAL_DECLSPEC int32_t +opal_datatype_create_contiguous( int count, const opal_datatype_t* oldType, + opal_datatype_t** newType ); +/** + * Add a new datatype to the base type description. The count is the number + * repetitions of the same element to be added, and the extent is the extent + * of each element. The displacement is the initial displacement of the + * first element. + */ +OPAL_DECLSPEC int32_t +opal_datatype_add( opal_datatype_t* pdtBase, + const opal_datatype_t* pdtAdd, size_t count, + ptrdiff_t disp, ptrdiff_t extent ); + +/** + * Alter the lb and extent of an existing datatype in place. + */ +OPAL_DECLSPEC int32_t +opal_datatype_resize( opal_datatype_t* type, + ptrdiff_t lb, + ptrdiff_t extent ); static inline int32_t opal_datatype_type_lb( const opal_datatype_t* pData, ptrdiff_t* disp ) diff --git a/opal/datatype/opal_datatype_add.c b/opal/datatype/opal_datatype_add.c index 146ce12afe2..108b4e3d1be 100644 --- a/opal/datatype/opal_datatype_add.c +++ b/opal/datatype/opal_datatype_add.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -281,15 +281,23 @@ int32_t opal_datatype_add( opal_datatype_t* pdtBase, const opal_datatype_t* pdtA if( (pdtAdd->flags & (OPAL_DATATYPE_FLAG_PREDEFINED | OPAL_DATATYPE_FLAG_DATA)) == (OPAL_DATATYPE_FLAG_PREDEFINED | OPAL_DATATYPE_FLAG_DATA) ) { if( NULL != pdtBase->ptypes ) pdtBase->ptypes[pdtAdd->id] += count; + + pLast->elem.common.flags = pdtAdd->flags & ~(OPAL_DATATYPE_FLAG_COMMITTED); pLast->elem.common.type = pdtAdd->id; - pLast->elem.count = count; pLast->elem.disp = disp; - pLast->elem.extent = extent; - pdtBase->desc.used++; - pLast->elem.common.flags = pdtAdd->flags & ~(OPAL_DATATYPE_FLAG_COMMITTED); - if( (extent != (ptrdiff_t)pdtAdd->size) && (count > 1) ) { /* gaps around the datatype */ - pLast->elem.common.flags &= ~(OPAL_DATATYPE_FLAG_CONTIGUOUS | OPAL_DATATYPE_FLAG_NO_GAPS); + pLast->elem.extent = count * extent; + /* assume predefined datatypes without extent, aka. contiguous */ + pLast->elem.count = 1; + pLast->elem.blocklen = count; + if( extent != (ptrdiff_t)pdtAdd->size ) { /* not contiguous: let's fix */ + pLast->elem.count = count; + pLast->elem.blocklen = 1; + pLast->elem.extent = extent; + if( count > 1 ) { /* gaps around the predefined datatype */ + pLast->elem.common.flags &= ~(OPAL_DATATYPE_FLAG_CONTIGUOUS | OPAL_DATATYPE_FLAG_NO_GAPS); + } } + pdtBase->desc.used++; } else { /* keep trace of the total number of basic datatypes in the datatype definition */ pdtBase->loops += pdtAdd->loops; @@ -299,13 +307,40 @@ int32_t opal_datatype_add( opal_datatype_t* pdtBase, const opal_datatype_t* pdtA for( i = OPAL_DATATYPE_FIRST_TYPE; i < OPAL_DATATYPE_MAX_PREDEFINED; i++ ) if( pdtAdd->ptypes[i] != 0 ) pdtBase->ptypes[i] += (count * pdtAdd->ptypes[i]); } - if( (1 == pdtAdd->desc.used) && (extent == (pdtAdd->ub - pdtAdd->lb)) && - (extent == pdtAdd->desc.desc[0].elem.extent) ){ + if( 1 == pdtAdd->desc.used ) { pLast->elem = pdtAdd->desc.desc[0].elem; - pLast->elem.count *= count; pLast->elem.disp += disp; + if( 1 == count ) { + /* Extent only has a meaning when there are multiple elements. Bail out */ + } else if( 1 == pLast->elem.count ) { + /* The size and true_extent of the added datatype are identical, signaling a datatype + * that is mostly contiguous with the exception of the initial and final gaps. These + * gaps do not matter here as they will amended (the initial gaps being shifted by the + * new displacement and the final gap being replaced with the new gap + */ + if( pdtAdd->desc.desc[0].elem.extent == extent ) { + /* pure bliss everything is fully contiguous and we can collapse + * everything by updating the blocklen and extent + */ + pLast->elem.blocklen *= count; + pLast->elem.extent *= count; + } else { + pLast->elem.count = count; + pLast->elem.extent = extent; + } + } else if( extent == (ptrdiff_t)(pLast->elem.count * pLast->elem.extent) ) { + /* It's just a repetition of the same element, increase the count */ + pLast->elem.count *= count; + } else { + /* No luck here, no optimization can be applied. Fall back to the + * normal case where we add a loop around the datatype. + */ + goto build_loop; + } pdtBase->desc.used++; } else { + +build_loop: /* if the extent of the datatype is the same as the extent of the loop * description of the datatype then we simply have to update the main loop. */ diff --git a/opal/datatype/opal_datatype_copy.h b/opal/datatype/opal_datatype_copy.h index 7aeac8e63ec..40f119a684d 100644 --- a/opal/datatype/opal_datatype_copy.h +++ b/opal/datatype/opal_datatype_copy.h @@ -48,37 +48,37 @@ static inline void _predefined_data( const dt_elem_desc_t* ELEM, unsigned char* DESTINATION, size_t* SPACE ) { - size_t _copy_count = (COUNT); - size_t _copy_blength; const ddt_elem_desc_t* _elem = &((ELEM)->elem); unsigned char* _source = (SOURCE) + _elem->disp; unsigned char* _destination = (DESTINATION) + _elem->disp; + size_t total_count = _elem->count * _elem->blocklen; + size_t do_now, do_now_bytes; - _copy_blength = opal_datatype_basicDatatypes[_elem->common.type]->size; + assert( (COUNT) == total_count); + assert( total_count <= ((*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size) ); - if( _copy_blength == (size_t)_elem->extent ) { - _copy_blength *= _copy_count; - OPAL_DATATYPE_SAFEGUARD_POINTER( _source, _copy_blength, (SOURCE_BASE), - (DATATYPE), (TOTAL_COUNT) ); - /* the extent and the size of the basic datatype are equals */ - DO_DEBUG( opal_output( 0, "copy 1. %s( %p, %p, %" PRIsize_t " ) => space %" PRIsize_t "\n", - STRINGIFY(MEM_OP_NAME), (void*)_destination, (void*)_source, _copy_blength, *(SPACE) ); ); - MEM_OP( _destination, _source, _copy_blength ); - _source += _copy_blength; - _destination += _copy_blength; - } else { - for(size_t _i = 0; _i < _copy_count; _i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( _source, _copy_blength, (SOURCE_BASE), - (DATATYPE), (TOTAL_COUNT) ); - DO_DEBUG( opal_output( 0, "copy 2. %s( %p, %p, %lu ) => space %lu\n", - STRINGIFY(MEM_OP_NAME), (void*)_destination, (void*)_source, (unsigned long)_copy_blength, (unsigned long)(*(SPACE) - (_i * _copy_blength)) ); ); - MEM_OP( _destination, _source, _copy_blength ); - _source += _elem->extent; + /* We don't a prologue and epilogue here as we are __always__ working + * with full copies of the data description. + */ + + /** + * Compute how many full blocklen we need to do and do them. + */ + do_now = _elem->count; + if( 0 != do_now ) { + do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size; + for(size_t _i = 0; _i < do_now; _i++ ) { + OPAL_DATATYPE_SAFEGUARD_POINTER( _source, do_now_bytes, (SOURCE_BASE), + (DATATYPE), (TOTAL_COUNT) ); + DO_DEBUG( opal_output( 0, "copy %s( %p, %p, %" PRIsize_t " ) => space %" PRIsize_t "\n", + STRINGIFY(MEM_OP_NAME), (void*)_destination, (void*)_source, do_now_bytes, *(SPACE) ); ); + MEM_OP( _destination, _source, do_now_bytes ); _destination += _elem->extent; + _source += _elem->extent; + *(SPACE) -= do_now_bytes; } - _copy_blength *= _copy_count; + (COUNT) -= total_count; } - *(SPACE) -= _copy_blength; } static inline void _contiguous_loop( const dt_elem_desc_t* ELEM, @@ -147,12 +147,10 @@ static inline int32_t _copy_content_same_ddt( const opal_datatype_t* datatype, i if( (ptrdiff_t)datatype->size == extent ) { /* all contiguous == no gaps around */ size_t total_length = iov_len_local; size_t memop_chunk = opal_datatype_memop_block_size; + OPAL_DATATYPE_SAFEGUARD_POINTER( source, iov_len_local, + (unsigned char*)source_base, datatype, count ); while( total_length > 0 ) { if( memop_chunk > total_length ) memop_chunk = total_length; - OPAL_DATATYPE_SAFEGUARD_POINTER( destination, memop_chunk, - (unsigned char*)destination_base, datatype, count ); - OPAL_DATATYPE_SAFEGUARD_POINTER( source, memop_chunk, - (unsigned char*)source_base, datatype, count ); DO_DEBUG( opal_output( 0, "copy c1. %s( %p, %p, %lu ) => space %lu\n", STRINGIFY(MEM_OP_NAME), (void*)destination, (void*)source, (unsigned long)memop_chunk, (unsigned long)total_length ); ); MEM_OP( destination, source, memop_chunk ); @@ -184,17 +182,12 @@ static inline int32_t _copy_content_same_ddt( const opal_datatype_t* datatype, i pos_desc = 0; stack_pos = 0; - if( datatype->opt_desc.desc != NULL ) { - description = datatype->opt_desc.desc; - } else { + description = datatype->opt_desc.desc; + if( NULL == description ) { description = datatype->desc.desc; } - if( description[0].elem.common.type == OPAL_DATATYPE_LOOP ) - count_desc = description[0].loop.loops; - else - count_desc = description[0].elem.count; - pElem = &(description[pos_desc]); + UPDATE_INTERNAL_COUNTERS( description, 0, pElem, count_desc ); while( 1 ) { while( OPAL_LIKELY(pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA) ) { diff --git a/opal/datatype/opal_datatype_get_count.c b/opal/datatype/opal_datatype_get_count.c index ae085c42704..f75b86d0e2d 100644 --- a/opal/datatype/opal_datatype_get_count.c +++ b/opal/datatype/opal_datatype_get_count.c @@ -69,14 +69,14 @@ ssize_t opal_datatype_get_element_count( const opal_datatype_t* datatype, size_t while( pElems[pos_desc].elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ const opal_datatype_t* basic_type = BASIC_DDT_FROM_ELEM(pElems[pos_desc]); - local_size = pElems[pos_desc].elem.count * basic_type->size; + local_size = (pElems[pos_desc].elem.count * pElems[pos_desc].elem.blocklen) * basic_type->size; if( local_size >= iSize ) { local_size = iSize / basic_type->size; nbElems += (int32_t)local_size; iSize -= local_size * basic_type->size; return (iSize == 0 ? nbElems : -1); } - nbElems += pElems[pos_desc].elem.count; + nbElems += (pElems[pos_desc].elem.count * pElems[pos_desc].elem.blocklen); iSize -= local_size; pos_desc++; /* advance to the next data */ } @@ -131,7 +131,7 @@ int32_t opal_datatype_set_element_count( const opal_datatype_t* datatype, size_t while( pElems[pos_desc].elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ const opal_datatype_t* basic_type = BASIC_DDT_FROM_ELEM(pElems[pos_desc]); - local_length = pElems[pos_desc].elem.count; + local_length = (pElems[pos_desc].elem.count * pElems[pos_desc].elem.blocklen); if( local_length >= count ) { *length += count * basic_type->size; return 0; @@ -188,8 +188,8 @@ int opal_datatype_compute_ptypes( opal_datatype_t* datatype ) } while( pElems[pos_desc].elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ - datatype->ptypes[pElems[pos_desc].elem.common.type] += pElems[pos_desc].elem.count; - nbElems += pElems[pos_desc].elem.count; + datatype->ptypes[pElems[pos_desc].elem.common.type] += pElems[pos_desc].elem.count * pElems[pos_desc].elem.blocklen; + nbElems += pElems[pos_desc].elem.count * pElems[pos_desc].elem.blocklen; DUMP( " compute_ptypes-add: type %d count %"PRIsize_t" (total type %"PRIsize_t" total %lld)\n", pElems[pos_desc].elem.common.type, datatype->ptypes[pElems[pos_desc].elem.common.type], diff --git a/opal/datatype/opal_datatype_internal.h b/opal/datatype/opal_datatype_internal.h index bc3f8aa7cab..2b2ddc0961e 100644 --- a/opal/datatype/opal_datatype_internal.h +++ b/opal/datatype/opal_datatype_internal.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2018 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -220,14 +220,14 @@ union dt_elem_desc { * elem.blocklen to create it. If the number is prime then create a second * element to account for the difference. */ -#define CREATE_ELEM( _place, _type, _flags, _count, _disp, _extent ) \ +#define CREATE_ELEM(_place, _type, _flags, _blocklen, _count, _disp, _extent) \ do { \ (_place)->elem.common.flags = (_flags) | OPAL_DATATYPE_FLAG_DATA; \ (_place)->elem.common.type = (_type); \ - (_place)->elem.disp = (_disp); \ - (_place)->elem.extent = (_extent); \ + (_place)->elem.blocklen = (_blocklen); \ (_place)->elem.count = (_count); \ - (_place)->elem.blocklen = 1; \ + (_place)->elem.extent = (_extent); \ + (_place)->elem.disp = (_disp); \ } while(0) /* * This array holds the descriptions desc.desc[2] of the predefined basic datatypes. @@ -480,22 +480,22 @@ static inline int GET_FIRST_NON_LOOP( const union dt_elem_desc* _pElem ) } #define UPDATE_INTERNAL_COUNTERS( DESCRIPTION, POSITION, ELEMENT, COUNTER ) \ - do { \ - (ELEMENT) = &((DESCRIPTION)[(POSITION)]); \ - if( OPAL_DATATYPE_LOOP == (ELEMENT)->elem.common.type ) \ - (COUNTER) = (ELEMENT)->loop.loops; \ - else \ - (COUNTER) = (ELEMENT)->elem.count; \ + do { \ + (ELEMENT) = &((DESCRIPTION)[(POSITION)]); \ + if( OPAL_DATATYPE_LOOP == (ELEMENT)->elem.common.type ) \ + (COUNTER) = (ELEMENT)->loop.loops; \ + else \ + (COUNTER) = (ELEMENT)->elem.count * (ELEMENT)->elem.blocklen; \ } while (0) OPAL_DECLSPEC int opal_datatype_contain_basic_datatypes( const struct opal_datatype_t* pData, char* ptr, size_t length ); OPAL_DECLSPEC int opal_datatype_dump_data_flags( unsigned short usflags, char* ptr, size_t length ); OPAL_DECLSPEC int opal_datatype_dump_data_desc( union dt_elem_desc* pDesc, int nbElems, char* ptr, size_t length ); -#if OPAL_ENABLE_DEBUG extern bool opal_position_debug; extern bool opal_copy_debug; -#endif /* OPAL_ENABLE_DEBUG */ +extern bool opal_unpack_debug; +extern bool opal_pack_debug; END_C_DECLS #endif /* OPAL_DATATYPE_INTERNAL_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/opal_datatype_module.c b/opal/datatype/opal_datatype_module.c index 2d8dedc94e7..7976392b63e 100644 --- a/opal/datatype/opal_datatype_module.c +++ b/opal/datatype/opal_datatype_module.c @@ -226,6 +226,7 @@ int32_t opal_datatype_init( void ) datatype->desc.desc[0].elem.common.type = i; /* datatype->desc.desc[0].elem.blocklen XXX not set at the moment, it will be needed later */ datatype->desc.desc[0].elem.count = 1; + datatype->desc.desc[0].elem.blocklen = 1; datatype->desc.desc[0].elem.disp = 0; datatype->desc.desc[0].elem.extent = datatype->size; diff --git a/opal/datatype/opal_datatype_monotonic.c b/opal/datatype/opal_datatype_monotonic.c index b467d95ecbe..247fd66142d 100644 --- a/opal/datatype/opal_datatype_monotonic.c +++ b/opal/datatype/opal_datatype_monotonic.c @@ -2,6 +2,9 @@ /* * Copyright (c) 2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2018-2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -18,35 +21,43 @@ #include "opal/datatype/opal_datatype_internal.h" #include "opal/datatype/opal_convertor.h" +#define OPAL_DATATYPE_MAX_MONOTONIC_IOVEC 32 + +/** + * Check if the datatype describes a memory layout where the pointers to + * the contiguous pieces are always advancing in the same direction, i.e. + * there is no potential for overlap. + */ int32_t opal_datatype_is_monotonic(opal_datatype_t* type ) { + struct iovec iov[OPAL_DATATYPE_MAX_MONOTONIC_IOVEC]; + ptrdiff_t upper_limit = (ptrdiff_t)type->true_lb; /* as conversion base will be NULL the first address is true_lb */ + size_t max_data = 0x7FFFFFFF; opal_convertor_t *pConv; + bool monotonic = true; uint32_t iov_count; - struct iovec iov[5]; - size_t max_data = 0; - long prev = -1; int rc; - bool monotonic = true; pConv = opal_convertor_create( opal_local_arch, 0 ); if (OPAL_UNLIKELY(NULL == pConv)) { - return 0; + return -1; } rc = opal_convertor_prepare_for_send( pConv, type, 1, NULL ); if( OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { OBJ_RELEASE(pConv); - return 0; + return -1; } do { - iov_count = 5; + iov_count = OPAL_DATATYPE_MAX_MONOTONIC_IOVEC; rc = opal_convertor_raw( pConv, iov, &iov_count, &max_data); - for (uint32_t i=0; icommon.flags = OPAL_DATATYPE_FLAG_BASIC; \ - _elem->common.type = OPAL_DATATYPE_LOOP; \ - _elem->count = 0; \ - _elem->disp = 0; \ - _elem->extent = 0; \ - } while (0) - static int32_t opal_datatype_optimize_short( opal_datatype_t* pData, size_t count, dt_type_desc_t* pTypeDesc ) { dt_elem_desc_t* pElemDesc; - ddt_elem_desc_t opt_elem; - dt_stack_t* pOrigStack; - dt_stack_t* pStack; /* pointer to the position on the stack */ - int32_t pos_desc = 0; /* actual position in the description of the derived datatype */ - int32_t stack_pos = 0, last_type = OPAL_DATATYPE_UINT1; - int32_t type = OPAL_DATATYPE_LOOP, nbElems = 0, continuity; - ptrdiff_t total_disp = 0, last_extent = 1, last_disp = 0; - uint16_t last_flags = 0xFFFF; /* keep all for the first datatype */ - uint32_t i; - size_t last_length = 0; + dt_stack_t *pOrigStack, *pStack; /* pointer to the position on the stack */ + int32_t pos_desc = 0; /* actual position in the description of the derived datatype */ + int32_t stack_pos = 0; + int32_t nbElems = 0; + ptrdiff_t total_disp = 0; + ddt_elem_desc_t last = {.common.flags = 0xFFFF /* all on */, .count = 0, .disp = 0}, compress; + ddt_elem_desc_t* current; pOrigStack = pStack = (dt_stack_t*)malloc( sizeof(dt_stack_t) * (pData->loops+2) ); SAVE_STACK( pStack, -1, 0, count, 0 ); @@ -64,22 +51,17 @@ opal_datatype_optimize_short( opal_datatype_t* pData, pTypeDesc->desc = pElemDesc = (dt_elem_desc_t*)malloc( sizeof(dt_elem_desc_t) * pTypeDesc->length ); pTypeDesc->used = 0; - SET_EMPTY_ELEMENT( &opt_elem ); assert( OPAL_DATATYPE_END_LOOP == pData->desc.desc[pData->desc.used].elem.common.type ); - opt_elem.common.type = OPAL_DATATYPE_LOOP; - opt_elem.common.flags = 0xFFFF; /* keep all for the first datatype */ - opt_elem.count = 0; - opt_elem.disp = pData->desc.desc[pData->desc.used].end_loop.first_elem_disp; - opt_elem.extent = 0; while( stack_pos >= 0 ) { if( OPAL_DATATYPE_END_LOOP == pData->desc.desc[pos_desc].elem.common.type ) { /* end of the current loop */ ddt_endloop_desc_t* end_loop = &(pData->desc.desc[pos_desc].end_loop); - if( last_length != 0 ) { - CREATE_ELEM( pElemDesc, last_type, OPAL_DATATYPE_FLAG_BASIC, last_length, last_disp, last_extent ); + if( 0 != last.count ) { + CREATE_ELEM( pElemDesc, last.common.type, OPAL_DATATYPE_FLAG_BASIC, + last.blocklen, last.count, last.disp, last.extent ); pElemDesc++; nbElems++; - last_disp += last_length; - last_length = 0; + last.disp += last.count; + last.count= 0; } CREATE_LOOP_END( pElemDesc, nbElems - pStack->index + 1, /* # of elems in this loop */ end_loop->first_elem_disp, end_loop->size, end_loop->common.flags ); @@ -97,153 +79,146 @@ opal_datatype_optimize_short( opal_datatype_t* pData, ddt_loop_desc_t* loop = (ddt_loop_desc_t*)&(pData->desc.desc[pos_desc]); ddt_endloop_desc_t* end_loop = (ddt_endloop_desc_t*)&(pData->desc.desc[pos_desc + loop->items]); int index = GET_FIRST_NON_LOOP( &(pData->desc.desc[pos_desc]) ); - ptrdiff_t loop_disp = pData->desc.desc[pos_desc + index].elem.disp; - continuity = ((last_disp + (ptrdiff_t)last_length * (ptrdiff_t)opal_datatype_basicDatatypes[last_type]->size) - == (total_disp + loop_disp)); if( loop->common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - /* the loop is contiguous or composed by contiguous elements with a gap */ - if( loop->extent == (ptrdiff_t)end_loop->size ) { - /* the whole loop is contiguous */ - if( !continuity ) { - if( 0 != last_length ) { - CREATE_ELEM( pElemDesc, last_type, OPAL_DATATYPE_FLAG_BASIC, - last_length, last_disp, last_extent ); - pElemDesc++; nbElems++; - last_length = 0; - } - last_disp = total_disp + loop_disp; - } - last_length = (last_length * opal_datatype_basicDatatypes[last_type]->size - + loop->loops * end_loop->size); - last_type = OPAL_DATATYPE_UINT1; - last_extent = 1; - } else { - int counter = loop->loops; - ptrdiff_t merged_disp = 0; - /* if the previous data is contiguous with this piece and it has a length not ZERO */ - if( last_length != 0 ) { - if( continuity ) { - last_length *= opal_datatype_basicDatatypes[last_type]->size; - last_length += end_loop->size; - last_type = OPAL_DATATYPE_UINT1; - last_extent = 1; - counter--; - merged_disp = loop->extent; /* merged loop, update the disp of the remaining elems */ - } - CREATE_ELEM( pElemDesc, last_type, OPAL_DATATYPE_FLAG_BASIC, - last_length, last_disp, last_extent ); - pElemDesc++; nbElems++; - last_disp += last_length; - last_length = 0; - last_type = OPAL_DATATYPE_LOOP; - } - /** - * The content of the loop is contiguous (maybe with a gap before or after). - * - * If any of the loops have been merged with the previous element, then the - * displacement of the first element (or the displacement of all elements if the - * loop will be removed) must be updated accordingly. - */ - if( counter <= 2 ) { - merged_disp += end_loop->first_elem_disp; - while( counter > 0 ) { - CREATE_ELEM( pElemDesc, OPAL_DATATYPE_UINT1, OPAL_DATATYPE_FLAG_BASIC, - end_loop->size, merged_disp, 1); - pElemDesc++; nbElems++; counter--; - merged_disp += loop->extent; - } - } else { - CREATE_LOOP_START( pElemDesc, counter, 2, loop->extent, loop->common.flags ); - pElemDesc++; nbElems++; - CREATE_ELEM( pElemDesc, OPAL_DATATYPE_UINT1, OPAL_DATATYPE_FLAG_BASIC, - end_loop->size, loop_disp, 1); - pElemDesc++; nbElems++; - CREATE_LOOP_END( pElemDesc, 2, end_loop->first_elem_disp + merged_disp, - end_loop->size, end_loop->common.flags ); - pElemDesc++; nbElems++; + assert(pData->desc.desc[pos_desc + index].elem.disp == end_loop->first_elem_disp); + compress.common.flags = loop->common.flags; + compress.common.type = pData->desc.desc[pos_desc + index].elem.common.type; + compress.blocklen = pData->desc.desc[pos_desc + index].elem.blocklen; + for( uint32_t i = index+1; i < loop->items; i++ ) { + current = &pData->desc.desc[pos_desc + i].elem; + assert(1 == current->count); + if( (current->common.type == OPAL_DATATYPE_LOOP) || + compress.common.type != current->common.type ) { + compress.common.type = OPAL_DATATYPE_UINT1; + compress.blocklen = end_loop->size; + break; } + compress.blocklen += current->blocklen; } + compress.count = loop->loops; + compress.extent = loop->extent; + compress.disp = end_loop->first_elem_disp; + + /** + * The current loop has been compressed and can now be treated as if it + * was a data element. We can now look if it can be fused with last, + * as done in the fusion of 2 elements below. Let's use the same code. + */ pos_desc += loop->items + 1; - } else { - ddt_elem_desc_t* elem = (ddt_elem_desc_t*)&(pData->desc.desc[pos_desc+1]); - if( last_length != 0 ) { - CREATE_ELEM( pElemDesc, last_type, OPAL_DATATYPE_FLAG_BASIC, last_length, last_disp, last_extent ); - pElemDesc++; nbElems++; - last_disp += last_length; - last_length = 0; - last_type = OPAL_DATATYPE_LOOP; - } - if( 2 == loop->items ) { /* small loop */ - if( (1 == elem->count) - && (elem->extent == (ptrdiff_t)opal_datatype_basicDatatypes[elem->common.type]->size) ) { - CREATE_ELEM( pElemDesc, elem->common.type, elem->common.flags & ~OPAL_DATATYPE_FLAG_CONTIGUOUS, - loop->loops, elem->disp, loop->extent ); + current = &compress; + goto fuse_loops; + } + + /** + * If the content of the loop is not contiguous there is little we can do + * that would not incur significant optimization cost and still be beneficial + * in reducing the number of memcpy during pack/unpack. + */ + + if( 0 != last.count ) { /* Generate the pending element */ + CREATE_ELEM( pElemDesc, last.common.type, OPAL_DATATYPE_FLAG_BASIC, + last.blocklen, last.count, last.disp, last.extent ); + pElemDesc++; nbElems++; + last.count = 0; + last.common.type = OPAL_DATATYPE_LOOP; + } + + /* Can we unroll the loop? */ + if( (loop->items <= 3) && (loop->loops <= 2) ) { + ptrdiff_t elem_displ = 0; + for( uint32_t i = 0; i < loop->loops; i++ ) { + for( uint32_t j = 0; j < (loop->items - 1); j++ ) { + current = &pData->desc.desc[pos_desc + index + j].elem; + CREATE_ELEM( pElemDesc, current->common.type, current->common.flags, + current->blocklen, current->count, current->disp + elem_displ, current->extent ); pElemDesc++; nbElems++; - pos_desc += loop->items + 1; - goto complete_loop; - } else if( loop->loops < 3 ) { - ptrdiff_t elem_displ = elem->disp; - for( i = 0; i < loop->loops; i++ ) { - CREATE_ELEM( pElemDesc, elem->common.type, elem->common.flags, - elem->count, elem_displ, elem->extent ); - elem_displ += loop->extent; - pElemDesc++; nbElems++; - } - pos_desc += loop->items + 1; - goto complete_loop; } + elem_displ += loop->extent; } - CREATE_LOOP_START( pElemDesc, loop->loops, loop->items, loop->extent, loop->common.flags ); - pElemDesc++; nbElems++; - PUSH_STACK( pStack, stack_pos, nbElems, OPAL_DATATYPE_LOOP, loop->loops, total_disp ); - pos_desc++; - DDT_DUMP_STACK( pStack, stack_pos, pData->desc.desc, "advance loops" ); + pos_desc += loop->items + 1; + goto complete_loop; } + + CREATE_LOOP_START( pElemDesc, loop->loops, loop->items, loop->extent, loop->common.flags ); + pElemDesc++; nbElems++; + PUSH_STACK( pStack, stack_pos, nbElems, OPAL_DATATYPE_LOOP, loop->loops, total_disp ); + pos_desc++; + DDT_DUMP_STACK( pStack, stack_pos, pData->desc.desc, "advance loops" ); + complete_loop: total_disp = pStack->disp; /* update the displacement */ continue; } - while( pData->desc.desc[pos_desc].elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* keep doing it until we reach a non datatype element */ - /* now here we have a basic datatype */ - type = pData->desc.desc[pos_desc].elem.common.type; - continuity = ((last_disp + (ptrdiff_t)last_length * (ptrdiff_t)opal_datatype_basicDatatypes[last_type]->size) - == (total_disp + pData->desc.desc[pos_desc].elem.disp)); + while( pData->desc.desc[pos_desc].elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* go over all basic datatype elements */ + current = &pData->desc.desc[pos_desc].elem; + pos_desc++; /* point to the next element as current points to the current one */ - if( (pData->desc.desc[pos_desc].elem.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) && continuity && - (pData->desc.desc[pos_desc].elem.extent == (int32_t)opal_datatype_basicDatatypes[type]->size) ) { - if( type == last_type ) { - last_length += pData->desc.desc[pos_desc].elem.count; - last_extent = pData->desc.desc[pos_desc].elem.extent; + fuse_loops: + if( 0 == last.count ) { /* first data of the datatype */ + last = *current; + continue; /* next data */ + } + + /* are the two elements compatible: aka they have very similar values and they + * can be merged together by increasing the count. This optimizes the memory + * required for storing the datatype description. + */ + if( ((last.blocklen * opal_datatype_basicDatatypes[last.common.type]->size) == + (current->blocklen * opal_datatype_basicDatatypes[current->common.type]->size)) && + (current->disp == (last.disp + (ptrdiff_t)last.count * last.extent)) && + ((last.count == 1) || (current->count == 1) || (last.extent == current->extent)) ) { + last.count += current->count; + if( last.count == 1 ) { + last.extent = current->extent; + } /* otherwise keep the last.extent */ + /* find the lowest common denomitaor type */ + if( last.common.type != current->common.type ) { + last.common.type = OPAL_DATATYPE_UINT1; + last.blocklen *= opal_datatype_basicDatatypes[last.common.type]->size; + } + continue; /* next data */ + } + /* are the elements fusionable such that we can fusion the last blocklen of one with the first + * blocklen of the other. + */ + if( (ptrdiff_t)(last.disp + (last.count - 1) * last.extent + last.blocklen * opal_datatype_basicDatatypes[last.common.type]->size) == + current->disp ) { + if( last.count != 1 ) { + CREATE_ELEM( pElemDesc, last.common.type, OPAL_DATATYPE_FLAG_BASIC, + last.blocklen, last.count - 1, last.disp, last.extent ); + pElemDesc++; nbElems++; + last.disp += (last.count - 1) * last.extent; + last.count = 1; + } + if( last.common.type == current->common.type ) { + last.blocklen += current->blocklen; } else { - if( last_length == 0 ) { - last_type = type; - last_length = pData->desc.desc[pos_desc].elem.count; - last_extent = pData->desc.desc[pos_desc].elem.extent; - } else { - last_length = last_length * opal_datatype_basicDatatypes[last_type]->size + - pData->desc.desc[pos_desc].elem.count * opal_datatype_basicDatatypes[type]->size; - last_type = OPAL_DATATYPE_UINT1; - last_extent = 1; - } + last.blocklen = ((last.blocklen * opal_datatype_basicDatatypes[last.common.type]->size) + + (current->blocklen * opal_datatype_basicDatatypes[current->common.type]->size)); + last.common.type = OPAL_DATATYPE_UINT1; } - last_flags &= pData->desc.desc[pos_desc].elem.common.flags; - } else { - if( last_length != 0 ) { - CREATE_ELEM( pElemDesc, last_type, OPAL_DATATYPE_FLAG_BASIC, last_length, last_disp, last_extent ); + last.extent += current->extent; + if( current->count != 1 ) { + CREATE_ELEM( pElemDesc, last.common.type, OPAL_DATATYPE_FLAG_BASIC, + last.blocklen, last.count, last.disp, last.extent ); pElemDesc++; nbElems++; + last = *current; + last.count -= 1; + last.disp += last.extent; } - last_disp = total_disp + pData->desc.desc[pos_desc].elem.disp; - last_length = pData->desc.desc[pos_desc].elem.count; - last_extent = pData->desc.desc[pos_desc].elem.extent; - last_type = type; + continue; } - pos_desc++; /* advance to the next data */ + CREATE_ELEM( pElemDesc, last.common.type, OPAL_DATATYPE_FLAG_BASIC, + last.blocklen, last.count, last.disp, last.extent ); + pElemDesc++; nbElems++; + last = *current; } } - if( last_length != 0 ) { - CREATE_ELEM( pElemDesc, last_type, OPAL_DATATYPE_FLAG_BASIC, last_length, last_disp, last_extent ); + if( 0 != last.count ) { + CREATE_ELEM( pElemDesc, last.common.type, OPAL_DATATYPE_FLAG_BASIC, + last.blocklen, last.count, last.disp, last.extent ); pElemDesc++; nbElems++; } /* cleanup the stack */ diff --git a/opal/datatype/opal_datatype_pack.h b/opal/datatype/opal_datatype_pack.h index f952cabc3c0..66259f8b66b 100644 --- a/opal/datatype/opal_datatype_pack.h +++ b/opal/datatype/opal_datatype_pack.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ /* - * Copyright (c) 2004-2009 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. @@ -19,8 +19,6 @@ #include "opal_config.h" -#include - #if !defined(CHECKSUM) && OPAL_CUDA_SUPPORT /* Make use of existing macro to do CUDA style memcpy */ #undef MEMCPY_CSUM @@ -28,75 +26,117 @@ CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) ) #endif -static inline void pack_predefined_data( opal_convertor_t* CONVERTOR, - const dt_elem_desc_t* ELEM, - size_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, - size_t* SPACE ) +static inline void +pack_predefined_data( opal_convertor_t* CONVERTOR, + const dt_elem_desc_t* ELEM, + size_t* COUNT, + unsigned char** memory, + unsigned char** packed, + size_t* SPACE ) { - size_t _copy_count = *(COUNT); - size_t _copy_blength; const ddt_elem_desc_t* _elem = &((ELEM)->elem); - unsigned char* _source = (*SOURCE) + _elem->disp; + size_t total_count = _elem->count * _elem->blocklen; + size_t cando_count = (*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size; + size_t do_now, do_now_bytes; + unsigned char* _memory = (*memory) + _elem->disp; + + assert( *(COUNT) <= _elem->count * _elem->blocklen); + + if( cando_count > *(COUNT) ) + cando_count = *(COUNT); + + /** + * First check if we already did something on this element ? + */ + do_now = (total_count - *(COUNT)); /* done elements */ + if( 0 != do_now ) { + do_now = do_now % _elem->blocklen; /* partial blocklen? */ + + if( 0 != do_now ) { + size_t left_in_block = _elem->blocklen - do_now; /* left in the current blocklen */ + do_now = (left_in_block > cando_count ) ? cando_count : left_in_block; + do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; - _copy_blength = opal_datatype_basicDatatypes[_elem->common.type]->size; - if( (_copy_count * _copy_blength) > *(SPACE) ) { - _copy_count = (*(SPACE) / _copy_blength); - if( 0 == _copy_count ) return; /* nothing to do */ + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "pack 1. memcpy( %p, %p, %lu ) => space %lu [prolog]\n", + (void*)*(packed), (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); + MEMCPY_CSUM( *(packed), _memory, do_now_bytes, (CONVERTOR) ); + _memory = (*memory) + _elem->disp + (ptrdiff_t)do_now_bytes; + /* compensate if we just completed a blocklen */ + if( do_now == left_in_block ) + _memory += _elem->extent - (_elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size); + *(packed) += do_now_bytes; + *(SPACE) -= do_now_bytes; + *(COUNT) -= do_now; + cando_count -= do_now; + } } - if( (ptrdiff_t)_copy_blength == _elem->extent ) { - _copy_blength *= _copy_count; - /* the extent and the size of the basic datatype are equal */ - OPAL_DATATYPE_SAFEGUARD_POINTER( _source, _copy_blength, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "pack 1. memcpy( %p, %p, %lu ) => space %lu\n", - (void*)*(DESTINATION), (void*)_source, (unsigned long)_copy_blength, (unsigned long)(*(SPACE)) ); ); - MEMCPY_CSUM( *(DESTINATION), _source, _copy_blength, (CONVERTOR) ); - _source += _copy_blength; - *(DESTINATION) += _copy_blength; - } else { - for(size_t _i = 0; _i < _copy_count; _i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( _source, _copy_blength, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); + /** + * Compute how many full blocklen we need to do and do them. + */ + do_now = cando_count / _elem->blocklen; + if( 0 != do_now ) { + do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size; + for(size_t _i = 0; _i < do_now; _i++ ) { + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n", - (void*)*(DESTINATION), (void*)_source, (unsigned long)_copy_blength, (unsigned long)(*(SPACE) - (_i * _copy_blength)) ); ); - MEMCPY_CSUM( *(DESTINATION), _source, _copy_blength, (CONVERTOR) ); - *(DESTINATION) += _copy_blength; - _source += _elem->extent; + (void*)*(packed), (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)*(SPACE) ); ); + MEMCPY_CSUM( *(packed), _memory, do_now_bytes, (CONVERTOR) ); + *(packed) += do_now_bytes; + _memory += _elem->extent; + *(SPACE) -= do_now_bytes; + *(COUNT) -= _elem->blocklen; + cando_count -= _elem->blocklen; } - _copy_blength *= _copy_count; } - *(SOURCE) = _source - _elem->disp; - *(SPACE) -= _copy_blength; - *(COUNT) -= _copy_count; + + /** + * As an epilog do anything left from the last blocklen. + */ + do_now = cando_count; + if( 0 != do_now ) { + do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "pack 3. memcpy( %p, %p, %lu ) => space %lu [epilog]\n", + (void*)*(packed), (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); + MEMCPY_CSUM( *(packed), _memory, do_now_bytes, (CONVERTOR) ); + _memory += do_now_bytes; + *(packed) += do_now_bytes; + *(SPACE) -= do_now_bytes; + *(COUNT) -= do_now; + } + + *(memory) = _memory - _elem->disp; } static inline void pack_contiguous_loop( opal_convertor_t* CONVERTOR, const dt_elem_desc_t* ELEM, size_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, + unsigned char** memory, + unsigned char** packed, size_t* SPACE ) { const ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); const ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); - unsigned char* _source = (*SOURCE) + _end_loop->first_elem_disp; + unsigned char* _memory = (*memory) + _end_loop->first_elem_disp; size_t _copy_loops = *(COUNT); if( (_copy_loops * _end_loop->size) > *(SPACE) ) _copy_loops = (*(SPACE) / _end_loop->size); for(size_t _i = 0; _i < _copy_loops; _i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( _source, _end_loop->size, (CONVERTOR)->pBaseBuf, + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, _end_loop->size, (CONVERTOR)->pBaseBuf, (CONVERTOR)->pDesc, (CONVERTOR)->count ); DO_DEBUG( opal_output( 0, "pack 3. memcpy( %p, %p, %lu ) => space %lu\n", - (void*)*(DESTINATION), (void*)_source, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i * _end_loop->size) ); ); - MEMCPY_CSUM( *(DESTINATION), _source, _end_loop->size, (CONVERTOR) ); - *(DESTINATION) += _end_loop->size; - _source += _loop->extent; + (void*)*(packed), (void*)_memory, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i * _end_loop->size) ); ); + MEMCPY_CSUM( *(packed), _memory, _end_loop->size, (CONVERTOR) ); + *(packed) += _end_loop->size; + _memory += _loop->extent; } - *(SOURCE) = _source - _end_loop->first_elem_disp; + *(memory) = _memory - _end_loop->first_elem_disp; *(SPACE) -= _copy_loops * _end_loop->size; *(COUNT) -= _copy_loops; } @@ -104,12 +144,12 @@ static inline void pack_contiguous_loop( opal_convertor_t* CONVERTOR, #define PACK_PREDEFINED_DATATYPE( CONVERTOR, /* the convertor */ \ ELEM, /* the basic element to be packed */ \ COUNT, /* the number of elements */ \ - SOURCE, /* the source pointer (char*) */ \ - DESTINATION, /* the destination pointer (char*) */ \ + MEMORY, /* the source pointer (char*) */ \ + PACKED, /* the destination pointer (char*) */ \ SPACE ) /* the space in the destination buffer */ \ -pack_predefined_data( (CONVERTOR), (ELEM), &(COUNT), &(SOURCE), &(DESTINATION), &(SPACE) ) +pack_predefined_data( (CONVERTOR), (ELEM), &(COUNT), &(MEMORY), &(PACKED), &(SPACE) ) -#define PACK_CONTIGUOUS_LOOP( CONVERTOR, ELEM, COUNT, SOURCE, DESTINATION, SPACE ) \ - pack_contiguous_loop( (CONVERTOR), (ELEM), &(COUNT), &(SOURCE), &(DESTINATION), &(SPACE) ) +#define PACK_CONTIGUOUS_LOOP( CONVERTOR, ELEM, COUNT, MEMORY, PACKED, SPACE ) \ + pack_contiguous_loop( (CONVERTOR), (ELEM), &(COUNT), &(MEMORY), &(PACKED), &(SPACE) ) #endif /* OPAL_DATATYPE_PACK_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/opal_datatype_position.c b/opal/datatype/opal_datatype_position.c index 3b8eaec69c6..381a31086d6 100644 --- a/opal/datatype/opal_datatype_position.c +++ b/opal/datatype/opal_datatype_position.c @@ -61,22 +61,77 @@ position_predefined_data( opal_convertor_t* CONVERTOR, unsigned char** POINTER, size_t* SPACE ) { - size_t _copy_count = *(COUNT); - size_t _copy_blength; - ddt_elem_desc_t* _elem = &((ELEM)->elem); + const ddt_elem_desc_t* _elem = &((ELEM)->elem); + size_t total_count = _elem->count * _elem->blocklen; + size_t cando_count = (*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size; + size_t do_now, do_now_bytes; + unsigned char* _memory = (*POINTER) + _elem->disp; - _copy_blength = opal_datatype_basicDatatypes[_elem->common.type]->size; - if( (_copy_count * _copy_blength) > *(SPACE) ) { - _copy_count = *(SPACE) / _copy_blength; - if( 0 == _copy_count ) return; /* nothing to do */ + assert( *(COUNT) <= _elem->count * _elem->blocklen); + + if( cando_count > *(COUNT) ) + cando_count = *(COUNT); + + /** + * First check if we already did something on this element ? + */ + do_now = (total_count - *(COUNT)); /* done elements */ + if( 0 != do_now ) { + do_now = do_now % _elem->blocklen; /* partial blocklen? */ + + if( 0 != do_now ) { + size_t left_in_block = _elem->blocklen - do_now; /* left in the current blocklen */ + do_now = (left_in_block > cando_count ) ? cando_count : left_in_block; + do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; + + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "position( %p, %lu ) => space %lu [prolog]\n", + (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); + _memory = *(POINTER) + _elem->disp + (ptrdiff_t)do_now_bytes; + /* compensate if we just completed a blocklen */ + if( do_now == left_in_block ) + _memory += _elem->extent - (_elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size); + *(SPACE) -= do_now_bytes; + *(COUNT) -= do_now; + cando_count -= do_now; + } + } + + /** + * Compute how many full blocklen we need to do and do them. + */ + do_now = cando_count / _elem->blocklen; + if( 0 != do_now ) { + do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size; + for(size_t _i = 0; _i < do_now; _i++ ) { + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "position( %p, %lu ) => space %lu\n", + (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)*(SPACE) ); ); + _memory += _elem->extent; + *(SPACE) -= do_now_bytes; + *(COUNT) -= _elem->blocklen; + cando_count -= _elem->blocklen; + } + } + + /** + * As an epilog do anything left from the last blocklen. + */ + do_now = cando_count; + if( 0 != do_now ) { + do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "position( %p, %lu ) => space %lu [epilog]\n", + (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); + _memory += do_now_bytes; + *(SPACE) -= do_now_bytes; + *(COUNT) -= do_now; } - _copy_blength *= _copy_count; - OPAL_DATATYPE_SAFEGUARD_POINTER( *(POINTER) + _elem->disp, _copy_blength, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - *(POINTER) += (_copy_count * _elem->extent); - *(SPACE) -= _copy_blength; - *(COUNT) -= _copy_count; + *(POINTER) = _memory - _elem->disp; } /** @@ -128,8 +183,8 @@ int opal_convertor_generic_simple_position( opal_convertor_t* pConvertor, /* We dont want to have to parse the datatype multiple times. What we are interested in * here is to compute the number of completed datatypes that we can move forward, update - * the counters and finally compute the position taking in account only the remaining - * elements. The only problem is that we have to modify all the elements on the stack. + * the counters and compute the position taking in account only the remaining elements. + * The only problem is that we have to modify all the elements on the stack. */ iov_len_local = *position - pConvertor->bConverted; if( iov_len_local > pConvertor->pDesc->size ) { diff --git a/opal/datatype/opal_datatype_unpack.h b/opal/datatype/opal_datatype_unpack.h index d837aad5ab7..f51a609294d 100644 --- a/opal/datatype/opal_datatype_unpack.h +++ b/opal/datatype/opal_datatype_unpack.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ /* - * Copyright (c) 2004-2009 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. @@ -27,83 +27,124 @@ #endif static inline void -unpack_predefined_data( opal_convertor_t* CONVERTOR, /* the convertor */ - const dt_elem_desc_t* ELEM, /* the element description */ - size_t* COUNT, /* the number of elements */ - unsigned char** SOURCE, /* the source pointer */ - unsigned char** DESTINATION, /* the destination pointer */ - size_t* SPACE ) /* the space in the destination buffer */ +unpack_predefined_data( opal_convertor_t* CONVERTOR, + const dt_elem_desc_t* ELEM, + size_t* COUNT, + unsigned char** packed, + unsigned char** memory, + size_t* SPACE ) { - size_t _copy_count = *(COUNT); - size_t _copy_blength; const ddt_elem_desc_t* _elem = &((ELEM)->elem); - unsigned char* _destination = (*DESTINATION) + _elem->disp; + size_t total_count = _elem->count * _elem->blocklen; + size_t cando_count = (*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size; + size_t do_now, do_now_bytes; + unsigned char* _memory = (*memory) + _elem->disp; - _copy_blength = opal_datatype_basicDatatypes[_elem->common.type]->size; - if( (_copy_count * _copy_blength) > *(SPACE) ) { - _copy_count = (*(SPACE) / _copy_blength); - if( 0 == _copy_count ) return; /* nothing to do */ + assert( *(COUNT) <= _elem->count * _elem->blocklen); + + if( cando_count > *(COUNT) ) + cando_count = *(COUNT); + + /** + * First check if we already did something on this element ? + */ + do_now = (total_count - *(COUNT)); /* done elements */ + if( 0 != do_now ) { + do_now = do_now % _elem->blocklen; /* partial blocklen? */ + + if( 0 != do_now ) { + size_t left_in_block = _elem->blocklen - do_now; /* left in the current blocklen */ + do_now = (left_in_block > cando_count ) ? cando_count : left_in_block; + do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; + + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "unpack 1. memcpy( %p, %p, %lu ) => space %lu [prolog]\n", + (void*)_memory, (void*)*(packed), (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); + MEMCPY_CSUM( _memory, *(packed), do_now_bytes, (CONVERTOR) ); + _memory = (*memory) + _elem->disp + (ptrdiff_t)do_now_bytes; + /* compensate if we just completed a blocklen */ + if( do_now == left_in_block ) + _memory += _elem->extent - (_elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size); + *(packed) += do_now_bytes; + *(SPACE) -= do_now_bytes; + *(COUNT) -= do_now; + cando_count -= do_now; + } } - if( (ptrdiff_t)_copy_blength == _elem->extent ) { - _copy_blength *= _copy_count; - /* the extent and the size of the basic datatype are equal */ - OPAL_DATATYPE_SAFEGUARD_POINTER( _destination, _copy_blength, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "unpack 1. memcpy( %p, %p, %lu ) => space %lu\n", - (void*)_destination, (void*)*(SOURCE), (unsigned long)_copy_blength, (unsigned long)(*(SPACE)) ); ); - MEMCPY_CSUM( _destination, *(SOURCE), _copy_blength, (CONVERTOR) ); - *(SOURCE) += _copy_blength; - _destination += _copy_blength; - } else { - for(size_t _i = 0; _i < _copy_count; _i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( _destination, _copy_blength, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "unpack 2. memcpy( %p, %p, %lu ) => space %lu\n", - (void*)_destination, (void*)*(SOURCE), (unsigned long)_copy_blength, (unsigned long)(*(SPACE) - (_i * _copy_blength)) ); ); - MEMCPY_CSUM( _destination, *(SOURCE), _copy_blength, (CONVERTOR) ); - *(SOURCE) += _copy_blength; - _destination += _elem->extent; + /** + * Compute how many full blocklen we need to do and do them. + */ + do_now = cando_count / _elem->blocklen; + if( 0 != do_now ) { + do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size; + for(size_t _i = 0; _i < do_now; _i++ ) { + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n", + (void*)_memory, (void*)*(packed), (unsigned long)do_now_bytes, (unsigned long)*(SPACE) ); ); + MEMCPY_CSUM( _memory, *(packed), do_now_bytes, (CONVERTOR) ); + *(packed) += do_now_bytes; + _memory += _elem->extent; + *(SPACE) -= do_now_bytes; + *(COUNT) -= _elem->blocklen; + cando_count -= _elem->blocklen; } - _copy_blength *= _copy_count; } - (*DESTINATION) = _destination - _elem->disp; - *(SPACE) -= _copy_blength; - *(COUNT) -= _copy_count; + + /** + * As an epilog do anything left from the last blocklen. + */ + do_now = cando_count; + if( 0 != do_now ) { + do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "pack 3. memcpy( %p, %p, %lu ) => space %lu [epilog]\n", + (void*)_memory, (void*)*(packed), (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); + MEMCPY_CSUM( _memory, *(packed), do_now_bytes, (CONVERTOR) ); + _memory += do_now_bytes; + *(packed) += do_now_bytes; + *(SPACE) -= do_now_bytes; + *(COUNT) -= do_now; + } + + *(memory) = _memory - _elem->disp; } static inline void unpack_contiguous_loop( opal_convertor_t* CONVERTOR, const dt_elem_desc_t* ELEM, size_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, + unsigned char** packed, + unsigned char** memory, size_t* SPACE ) { const ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); const ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); - unsigned char* _destination = (*DESTINATION) + _end_loop->first_elem_disp; + unsigned char* _memory = (*memory) + _end_loop->first_elem_disp; size_t _copy_loops = *(COUNT); if( (_copy_loops * _end_loop->size) > *(SPACE) ) _copy_loops = (*(SPACE) / _end_loop->size); for(size_t _i = 0; _i < _copy_loops; _i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( _destination, _end_loop->size, (CONVERTOR)->pBaseBuf, + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, _end_loop->size, (CONVERTOR)->pBaseBuf, (CONVERTOR)->pDesc, (CONVERTOR)->count ); DO_DEBUG( opal_output( 0, "unpack 3. memcpy( %p, %p, %lu ) => space %lu\n", - (void*)_destination, (void*)*(SOURCE), (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i * _end_loop->size) ); ); - MEMCPY_CSUM( _destination, *(SOURCE), _end_loop->size, (CONVERTOR) ); - *(SOURCE) += _end_loop->size; - _destination += _loop->extent; + (void*)_memory, (void*)*(packed), (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i * _end_loop->size) ); ); + MEMCPY_CSUM( _memory, *(packed), _end_loop->size, (CONVERTOR) ); + *(packed) += _end_loop->size; + _memory += _loop->extent; } - *(DESTINATION) = _destination - _end_loop->first_elem_disp; - *(SPACE) -= _copy_loops * _end_loop->size; - *(COUNT) -= _copy_loops; + *(memory) = _memory - _end_loop->first_elem_disp; + *(SPACE) -= _copy_loops * _end_loop->size; + *(COUNT) -= _copy_loops; } -#define UNPACK_PREDEFINED_DATATYPE( CONVERTOR, ELEM, COUNT, SOURCE, DESTINATION, SPACE ) \ - unpack_predefined_data( (CONVERTOR), (ELEM), &(COUNT), &(SOURCE), &(DESTINATION), &(SPACE) ) +#define UNPACK_PREDEFINED_DATATYPE( CONVERTOR, ELEM, COUNT, PACKED, MEMORY, SPACE ) \ + unpack_predefined_data( (CONVERTOR), (ELEM), &(COUNT), &(PACKED), &(MEMORY), &(SPACE) ) -#define UNPACK_CONTIGUOUS_LOOP( CONVERTOR, ELEM, COUNT, SOURCE, DESTINATION, SPACE ) \ - unpack_contiguous_loop( (CONVERTOR), (ELEM), &(COUNT), &(SOURCE), &(DESTINATION), &(SPACE) ) +#define UNPACK_CONTIGUOUS_LOOP( CONVERTOR, ELEM, COUNT, PACKED, MEMORY, SPACE ) \ + unpack_contiguous_loop( (CONVERTOR), (ELEM), &(COUNT), &(PACKED), &(MEMORY), &(SPACE) ) #endif /* OPAL_DATATYPE_UNPACK_H_HAS_BEEN_INCLUDED */ From 8b794235b8d9882154f45734f1290a4eeedfe4c6 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 8 May 2019 13:08:48 -0400 Subject: [PATCH 03/14] Update the datatype dump to match the actual types. Update the comments to better reflect what is going on. Minor indentations. Signed-off-by: George Bosilca --- ompi/datatype/ompi_datatype_module.c | 18 ++++---- opal/datatype/opal_convertor.c | 6 +-- opal/datatype/opal_convertor_raw.c | 30 ++++++------- opal/datatype/opal_datatype_dump.c | 36 +++++++-------- test/datatype/ddt_raw.c | 66 +++++++++++++++------------- 5 files changed, 80 insertions(+), 76 deletions(-) diff --git a/ompi/datatype/ompi_datatype_module.c b/ompi/datatype/ompi_datatype_module.c index 3ee09173cd8..33e8d9b9e92 100644 --- a/ompi/datatype/ompi_datatype_module.c +++ b/ompi/datatype/ompi_datatype_module.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -736,14 +736,14 @@ void ompi_datatype_dump( const ompi_datatype_t* pData ) length = length * 100 + 500; buffer = (char*)malloc( length ); index += snprintf( buffer, length - index, - "Datatype %p[%s] id %d size %ld align %d opal_id %d length %d used %d\n" - "true_lb %ld true_ub %ld (true_extent %ld) lb %ld ub %ld (extent %ld)\n" - "nbElems %d loops %d flags %X (", - (void*)pData, pData->name, pData->id, - (long)pData->super.size, (int)pData->super.align, pData->super.id, (int)pData->super.desc.length, (int)pData->super.desc.used, - (long)pData->super.true_lb, (long)pData->super.true_ub, (long)(pData->super.true_ub - pData->super.true_lb), - (long)pData->super.lb, (long)pData->super.ub, (long)(pData->super.ub - pData->super.lb), - (int)pData->super.nbElems, (int)pData->super.loops, (int)pData->super.flags ); + "Datatype %p[%s] id %d size %" PRIsize_t " align %u opal_id %u length %" PRIsize_t " used %" PRIsize_t "\n" + "true_lb %td true_ub %td (true_extent %td) lb %td ub %td (extent %td)\n" + "nbElems %" PRIsize_t " loops %u flags %X (", + (void*)pData, pData->name, pData->id, + pData->super.size, pData->super.align, (uint32_t)pData->super.id, pData->super.desc.length, pData->super.desc.used, + pData->super.true_lb, pData->super.true_ub, pData->super.true_ub - pData->super.true_lb, + pData->super.lb, pData->super.ub, pData->super.ub - pData->super.lb, + pData->super.nbElems, pData->super.loops, (int)pData->super.flags ); /* dump the flags */ if( ompi_datatype_is_predefined(pData) ) { index += snprintf( buffer + index, length - index, "predefined " ); diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c index 631d3adab43..331cb95a715 100644 --- a/opal/datatype/opal_convertor.c +++ b/opal/datatype/opal_convertor.c @@ -700,12 +700,12 @@ int opal_convertor_clone( const opal_convertor_t* source, void opal_convertor_dump( opal_convertor_t* convertor ) { - opal_output( 0, "Convertor %p count %" PRIsize_t" stack position %d bConverted %" PRIsize_t "\n" - "\tlocal_size %ld remote_size %ld flags %X stack_size %d pending_length %" PRIsize_t "\n" + opal_output( 0, "Convertor %p count %" PRIsize_t " stack position %u bConverted %" PRIsize_t "\n" + "\tlocal_size %" PRIsize_t " remote_size %" PRIsize_t " flags %X stack_size %u pending_length %" PRIsize_t "\n" "\tremote_arch %u local_arch %u\n", (void*)convertor, convertor->count, convertor->stack_pos, convertor->bConverted, - (unsigned long)convertor->local_size, (unsigned long)convertor->remote_size, + convertor->local_size, convertor->remote_size, convertor->flags, convertor->stack_size, convertor->partial_length, convertor->remoteArch, opal_local_arch ); if( convertor->flags & CONVERTOR_RECV ) opal_output( 0, "unpack "); diff --git a/opal/datatype/opal_convertor_raw.c b/opal/datatype/opal_convertor_raw.c index 28022809679..3c2073155b2 100644 --- a/opal/datatype/opal_convertor_raw.c +++ b/opal/datatype/opal_convertor_raw.c @@ -32,13 +32,13 @@ /** * This function always work in local representation. This means no representation - * conversion (i.e. no heterogeneity) has to be taken into account, and that all + * conversion (i.e. no heterogeneity) is taken into account, and that all * length we're working on are local. */ int32_t opal_convertor_raw( opal_convertor_t* pConvertor, - struct iovec* iov, uint32_t* iov_count, - size_t* length ) + struct iovec* iov, uint32_t* iov_count, + size_t* length ) { const opal_datatype_t *pData = pConvertor->pDesc; dt_stack_t* pStack; /* pointer to the position on the stack */ @@ -77,9 +77,9 @@ opal_convertor_raw( opal_convertor_t* pConvertor, description = pConvertor->use_desc->desc; /* For the first step we have to add both displacement to the source. After in the - * main while loop we will set back the source_base to the correct value. This is - * due to the fact that the convertor can stop in the middle of a data with a count - */ + * main while loop we will set back the source_base to the correct value. This is + * due to the fact that the convertor can stop in the middle of a data with a count + */ pStack = pConvertor->pStack + pConvertor->stack_pos; pos_desc = pStack->index; source_base = pConvertor->pBaseBuf + pStack->disp; @@ -101,9 +101,9 @@ opal_convertor_raw( opal_convertor_t* pConvertor, blength *= count_desc; /* now here we have a basic datatype */ OPAL_DATATYPE_SAFEGUARD_POINTER( source_base, blength, pConvertor->pBaseBuf, - pConvertor->pDesc, pConvertor->count ); + pConvertor->pDesc, pConvertor->count ); DO_DEBUG( opal_output( 0, "raw 1. iov[%d] = {base %p, length %" PRIsize_t "}\n", - index, (void*)source_base, (unsigned long)blength ); ); + index, (void*)source_base, blength ); ); iov[index].iov_base = (IOVBASE_TYPE *) source_base; iov[index].iov_len = blength; source_base += blength; @@ -114,9 +114,9 @@ opal_convertor_raw( opal_convertor_t* pConvertor, } else { for(size_t i = count_desc; (i > 0) && (index < *iov_count); i--, index++ ) { OPAL_DATATYPE_SAFEGUARD_POINTER( source_base, blength, pConvertor->pBaseBuf, - pConvertor->pDesc, pConvertor->count ); + pConvertor->pDesc, pConvertor->count ); DO_DEBUG( opal_output( 0, "raw 2. iov[%d] = {base %p, length %" PRIsize_t "}\n", - index, (void*)source_base, (unsigned long)blength ); ); + index, (void*)source_base, blength ); ); iov[index].iov_base = (IOVBASE_TYPE *) source_base; iov[index].iov_len = blength; source_base += pElem->elem.extent; @@ -141,8 +141,8 @@ opal_convertor_raw( opal_convertor_t* pConvertor, if( --(pStack->count) == 0 ) { /* end of loop */ if( pConvertor->stack_pos == 0 ) { /* we lie about the size of the next element in order to - * make sure we exit the main loop. - */ + * make sure we exit the main loop. + */ *iov_count = index; goto complete_loop; /* completed */ } @@ -174,7 +174,7 @@ opal_convertor_raw( opal_convertor_t* pConvertor, source_base += offset; for(size_t i = MIN(count_desc, *iov_count - index); i > 0; i--, index++ ) { OPAL_DATATYPE_SAFEGUARD_POINTER( source_base, end_loop->size, pConvertor->pBaseBuf, - pConvertor->pDesc, pConvertor->count ); + pConvertor->pDesc, pConvertor->count ); iov[index].iov_base = (IOVBASE_TYPE *) source_base; iov[index].iov_len = end_loop->size; source_base += pElem->loop.extent; @@ -198,14 +198,14 @@ opal_convertor_raw( opal_convertor_t* pConvertor, PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, pStack->disp + local_disp); pos_desc++; - update_loop_description: /* update the current state */ + update_loop_description: /* update the current state */ source_base = pConvertor->pBaseBuf + pStack->disp; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" ); continue; } } -complete_loop: + complete_loop: pConvertor->bConverted += raw_data; /* update the already converted bytes */ *length = raw_data; *iov_count = index; diff --git a/opal/datatype/opal_datatype_dump.c b/opal/datatype/opal_datatype_dump.c index 4c26292b8be..7782a805d0a 100644 --- a/opal/datatype/opal_datatype_dump.c +++ b/opal/datatype/opal_datatype_dump.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -64,7 +64,7 @@ int opal_datatype_dump_data_flags( unsigned short usflags, char* ptr, size_t len int index = 0; if( length < 22 ) return 0; index = snprintf( ptr, 22, "-----------[---][---]" ); /* set everything to - */ - if( usflags & OPAL_DATATYPE_FLAG_COMMITTED ) ptr[1] = 'c'; + if( usflags & OPAL_DATATYPE_FLAG_COMMITTED ) ptr[1] = 'c'; if( usflags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) ptr[2] = 'C'; if( usflags & OPAL_DATATYPE_FLAG_OVERLAP ) ptr[3] = 'o'; if( usflags & OPAL_DATATYPE_FLAG_USER_LB ) ptr[4] = 'l'; @@ -90,17 +90,17 @@ int opal_datatype_dump_data_desc( dt_elem_desc_t* pDesc, int nbElems, char* ptr, index += snprintf( ptr + index, length - index, "%15s ", opal_datatype_basicDatatypes[pDesc->elem.common.type]->name ); if( length <= (size_t)index ) break; if( OPAL_DATATYPE_LOOP == pDesc->elem.common.type ) - index += snprintf( ptr + index, length - index, "%d times the next %d elements extent %d\n", - (int)pDesc->loop.loops, (int)pDesc->loop.items, - (int)pDesc->loop.extent ); + index += snprintf( ptr + index, length - index, "%u times the next %u elements extent %td\n", + pDesc->loop.loops, pDesc->loop.items, + pDesc->loop.extent ); else if( OPAL_DATATYPE_END_LOOP == pDesc->elem.common.type ) - index += snprintf( ptr + index, length - index, "prev %d elements first elem displacement %ld size of data %d\n", - (int)pDesc->end_loop.items, (long)pDesc->end_loop.first_elem_disp, - (int)pDesc->end_loop.size ); + index += snprintf( ptr + index, length - index, "prev %u elements first elem displacement %td size of data %" PRIsize_t "\n", + pDesc->end_loop.items, pDesc->end_loop.first_elem_disp, + pDesc->end_loop.size ); else - index += snprintf( ptr + index, length - index, "count %" PRIsize_t " disp 0x%lx (%ld) blen %d extent %ld (size %ld)\n", - pDesc->elem.count, (long)pDesc->elem.disp, (long)pDesc->elem.disp, (int)pDesc->elem.blocklen, - pDesc->elem.extent, (long)(pDesc->elem.count * opal_datatype_basicDatatypes[pDesc->elem.common.type]->size) ); + index += snprintf( ptr + index, length - index, "count %" PRIsize_t " disp 0x%tx (%td) blen %u extent %td (size %zd)\n", + pDesc->elem.count, pDesc->elem.disp, pDesc->elem.disp, pDesc->elem.blocklen, + pDesc->elem.extent, (pDesc->elem.count * pDesc->elem.blocklen * opal_datatype_basicDatatypes[pDesc->elem.common.type]->size) ); pDesc++; if( length <= (size_t)index ) break; @@ -118,13 +118,13 @@ void opal_datatype_dump( const opal_datatype_t* pData ) length = pData->opt_desc.used + pData->desc.used; length = length * 100 + 500; buffer = (char*)malloc( length ); - index += snprintf( buffer, length - index, "Datatype %p[%s] size %ld align %d id %d length %d used %d\n" - "true_lb %ld true_ub %ld (true_extent %ld) lb %ld ub %ld (extent %ld)\n" - "nbElems %" PRIsize_t " loops %d flags %X (", - (void*)pData, pData->name, (long)pData->size, (int)pData->align, pData->id, (int)pData->desc.length, (int)pData->desc.used, - (long)pData->true_lb, (long)pData->true_ub, (long)(pData->true_ub - pData->true_lb), - (long)pData->lb, (long)pData->ub, (long)(pData->ub - pData->lb), - pData->nbElems, (int)pData->loops, (int)pData->flags ); + index += snprintf( buffer, length - index, "Datatype %p[%s] size %" PRIsize_t " align %u id %u length %" PRIsize_t " used %" PRIsize_t "\n" + "true_lb %td true_ub %td (true_extent %td) lb %td ub %td (extent %td)\n" + "nbElems %" PRIsize_t " loops %u flags %X (", + (void*)pData, pData->name, pData->size, pData->align, (uint32_t)pData->id, pData->desc.length, pData->desc.used, + pData->true_lb, pData->true_ub, pData->true_ub - pData->true_lb, + pData->lb, pData->ub, pData->ub - pData->lb, + pData->nbElems, pData->loops, (int)pData->flags ); /* dump the flags */ if( pData->flags == OPAL_DATATYPE_FLAG_PREDEFINED ) index += snprintf( buffer + index, length - index, "predefined " ); diff --git a/test/datatype/ddt_raw.c b/test/datatype/ddt_raw.c index de35d6b83f4..bba285ceea0 100644 --- a/test/datatype/ddt_raw.c +++ b/test/datatype/ddt_raw.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -71,7 +71,7 @@ static int test_upper( unsigned int length ) iov_count = 5; max_data = 0; opal_convertor_raw( pConv, iov, &iov_count, &max_data ); - i -= max_data; + i -= max_data; } GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); @@ -85,12 +85,12 @@ static int test_upper( unsigned int length ) } /** - * Conversion function. They deal with data-types in 3 ways, always making local copies. + * Conversion function. They deal with datatypes in 3 ways, always making local copies. * In order to allow performance testings, there are 3 functions: * - one copying directly from one memory location to another one using the - * data-type copy function. - * - one which use a 2 convertors created with the same data-type - * - and one using 2 convertors created from different data-types. + * datatype copy function. + * - one which use a 2 convertors created with the same datatype + * - and one using 2 convertors created from different datatypes. * */ static int local_copy_ddt_raw( ompi_datatype_t* pdt, int count, int iov_num ) @@ -114,13 +114,13 @@ static int local_copy_ddt_raw( ompi_datatype_t* pdt, int count, int iov_num ) GET_TIME( start ); while( 0 == opal_convertor_raw(convertor, iov, &iov_count, &max_data) ) { #if 0 - printf( "New raw extraction (iov_count = %d, max_data = %zu)\n", - iov_count, max_data ); - for( i = 0; i < iov_count; i++ ) { - printf( "\t{%p, %d}\n", iov[i].iov_base, iov[i].iov_len ); - } + printf( "New raw extraction (iov_count = %d, max_data = %zu)\n", + iov_count, max_data ); + for( i = 0; i < iov_count; i++ ) { + printf( "\t{%p, %d}\n", iov[i].iov_base, iov[i].iov_len ); + } #endif - remaining_length -= max_data; + remaining_length -= max_data; iov_count = iov_num; } remaining_length -= max_data; @@ -129,19 +129,23 @@ static int local_copy_ddt_raw( ompi_datatype_t* pdt, int count, int iov_num ) printf( "raw extraction in %ld microsec\n", total_time ); OBJ_RELEASE( convertor ); if( remaining_length != 0 ) { - printf( "Not all raw description was been extracted (%lu bytes missing)\n", - (unsigned long) remaining_length ); + printf( "Not all raw description was been extracted (%lu bytes missing)\n", + (unsigned long) remaining_length ); } free(iov); return OMPI_SUCCESS; } /** - * Main function. Call several tests and print-out the results. It try to stress the convertor - * using difficult data-type constructions as well as strange segment sizes for the conversion. - * Usually, it is able to detect most of the data-type and convertor problems. Any modifications - * on the data-type engine should first pass all the tests from this file, before going into other - * tests. + * Go over a set of datatypes and copy them using the raw functionality provided by the + * convertor. The goal of this test is to stress the convertor using several more or less + * difficult datatype, with a large set of segment sizes for the conversion. It can be used + * to highlight the raw capability of the convertor as well as detecting datatype convertor + * problems. + * + * This test is part of the testing infrastructure for the core datatype engine. As such any + * modifications on the datatype engine should first pass all the tests from this file, + * before going into other tests. */ int main( int argc, char* argv[] ) { @@ -226,7 +230,7 @@ int main( int argc, char* argv[] ) OBJ_RELEASE( pdt3 ); assert( pdt3 == NULL ); printf( ">>--------------------------------------------<<\n" ); - printf( " Contiguous data-type (MPI_DOUBLE)\n" ); + printf( " Contiguous datatype (MPI_DOUBLE)\n" ); pdt = MPI_DOUBLE; if( outputFlags & CHECK_PACK_UNPACK ) { local_copy_ddt_raw(pdt, 4500, iov_num); @@ -235,37 +239,37 @@ int main( int argc, char* argv[] ) printf( ">>--------------------------------------------<<\n" ); if( outputFlags & CHECK_PACK_UNPACK ) { - printf( "Contiguous multiple data-type (4500*1)\n" ); + printf( "Contiguous multiple datatype (4500*1)\n" ); pdt = create_contiguous_type( MPI_DOUBLE, 4500 ); local_copy_ddt_raw(pdt, 1, iov_num); OBJ_RELEASE( pdt ); assert( pdt == NULL ); - printf( "Contiguous multiple data-type (450*10)\n" ); + printf( "Contiguous multiple datatype (450*10)\n" ); pdt = create_contiguous_type( MPI_DOUBLE, 450 ); local_copy_ddt_raw(pdt, 10, iov_num); OBJ_RELEASE( pdt ); assert( pdt == NULL ); - printf( "Contiguous multiple data-type (45*100)\n" ); + printf( "Contiguous multiple datatype (45*100)\n" ); pdt = create_contiguous_type( MPI_DOUBLE, 45 ); local_copy_ddt_raw(pdt, 100, iov_num); OBJ_RELEASE( pdt ); assert( pdt == NULL ); - printf( "Contiguous multiple data-type (100*45)\n" ); + printf( "Contiguous multiple datatype (100*45)\n" ); pdt = create_contiguous_type( MPI_DOUBLE, 100 ); local_copy_ddt_raw(pdt, 45, iov_num); OBJ_RELEASE( pdt ); assert( pdt == NULL ); - printf( "Contiguous multiple data-type (10*450)\n" ); + printf( "Contiguous multiple datatype (10*450)\n" ); pdt = create_contiguous_type( MPI_DOUBLE, 10 ); local_copy_ddt_raw(pdt, 450, iov_num); OBJ_RELEASE( pdt ); assert( pdt == NULL ); - printf( "Contiguous multiple data-type (1*4500)\n" ); + printf( "Contiguous multiple datatype (1*4500)\n" ); pdt = create_contiguous_type( MPI_DOUBLE, 1 ); local_copy_ddt_raw(pdt, 4500, iov_num); OBJ_RELEASE( pdt ); assert( pdt == NULL ); } printf( ">>--------------------------------------------<<\n" ); printf( ">>--------------------------------------------<<\n" ); - printf( "Vector data-type (450 times 10 double stride 11)\n" ); + printf( "Vector datatype (450 times 10 double stride 11)\n" ); pdt = create_vector_type( MPI_DOUBLE, 450, 10, 11 ); if( outputFlags & DUMP_DATA_AFTER_COMMIT ) { - ompi_datatype_dump( pdt ); + ompi_datatype_dump( pdt ); } if( outputFlags & CHECK_PACK_UNPACK ) { local_copy_ddt_raw(pdt, 1, iov_num); @@ -292,9 +296,9 @@ int main( int argc, char* argv[] ) printf( ">>--------------------------------------------<<\n" ); pdt = test_create_blacs_type(); if( outputFlags & CHECK_PACK_UNPACK ) { - if( outputFlags & DUMP_DATA_AFTER_COMMIT ) { - ompi_datatype_dump( pdt ); - } + if( outputFlags & DUMP_DATA_AFTER_COMMIT ) { + ompi_datatype_dump( pdt ); + } local_copy_ddt_raw(pdt, 4500, iov_num); } printf( ">>--------------------------------------------<<\n" ); From 4cdc2155e540d13b3145aa5cc095f4d7282c072d Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Mon, 20 May 2019 11:39:16 -0400 Subject: [PATCH 04/14] Optimize the raw representation. Merge contiguous iov in order to minimize the number of returned iovec. Signed-off-by: George Bosilca --- opal/datatype/opal_convertor_raw.c | 175 ++++++++++++++++++----------- 1 file changed, 109 insertions(+), 66 deletions(-) diff --git a/opal/datatype/opal_convertor_raw.c b/opal/datatype/opal_convertor_raw.c index 3c2073155b2..df2340122a9 100644 --- a/opal/datatype/opal_convertor_raw.c +++ b/opal/datatype/opal_convertor_raw.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ /* - * Copyright (c) 2004-2009 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. @@ -30,6 +30,29 @@ #define DO_DEBUG(INST) #endif /* OPAL_ENABLE_DEBUG */ +/* Take a new iovec (base + len) and try to merge it with what we already + * have. If we succeed return 0 and move forward, if not save it into a new + * iovec location. If we need to go to a new position and we reach the end + * of the iovec array, return 1 to signal we did not saved the last iovec. + */ +static inline int +opal_convertor_merge_iov( struct iovec* iov, uint32_t* iov_count, + IOVBASE_TYPE* base, size_t len, + uint32_t* idx ) +{ + if( 0 != iov[*idx].iov_len ) { + if( (base == ((char*)iov[*idx].iov_base + iov[*idx].iov_len)) ) { + iov[*idx].iov_len += len; /* merge with previous iovec */ + return 0; + } /* cannot merge, move to the next position */ + *idx = *idx + 1; + if( *idx == *iov_count ) return 1; /* do not overwrite outside the iove array boundaries */ + } + iov[*idx].iov_base = base; + iov[*idx].iov_len = len; + return 0; +} + /** * This function always work in local representation. This means no representation * conversion (i.e. no heterogeneity) is taken into account, and that all @@ -44,10 +67,11 @@ opal_convertor_raw( opal_convertor_t* pConvertor, dt_stack_t* pStack; /* pointer to the position on the stack */ uint32_t pos_desc; /* actual position in the description of the derived datatype */ size_t count_desc; /* the number of items already done in the actual pos_desc */ + size_t do_now, blength; dt_elem_desc_t* description, *pElem; unsigned char *source_base; /* origin of the data */ - size_t raw_data = 0; /* sum of raw data lengths in the iov_len fields */ - uint32_t index = 0; /* the iov index and a simple counter */ + size_t sum_iov_len = 0; /* sum of raw data lengths in the iov_len fields */ + uint32_t index = 0; /* the iov index and a simple counter */ assert( (*iov_count) > 0 ); if( OPAL_LIKELY(pConvertor->flags & CONVERTOR_COMPLETED) ) { @@ -87,64 +111,86 @@ opal_convertor_raw( opal_convertor_t* pConvertor, pStack--; pConvertor->stack_pos--; pElem = &(description[pos_desc]); - source_base += pStack->disp; + DO_DEBUG( opal_output( 0, "raw start pos_desc %d count_desc %" PRIsize_t " disp %ld\n" "stack_pos %d pos_desc %d count_desc %" PRIsize_t " disp %ld\n", pos_desc, count_desc, (long)(source_base - pConvertor->pBaseBuf), pConvertor->stack_pos, pStack->index, pStack->count, (long)pStack->disp ); ); + + iov[index].iov_len = 0; + /* Special case if we start from a position that is in the middle of a data element blocklen. + * We can treat this outside the loop as it is an exception that can only happen once, + * and will simplify the loop handling. + */ + if( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + const ddt_elem_desc_t* current = &(pElem->elem); + + if( count_desc != (current->count * current->blocklen) ) { /* Not the full element description */ + do_now = current->blocklen - (count_desc % current->blocklen); /* how much left in the block */ + if( do_now ) { + source_base += current->disp; + blength = do_now * opal_datatype_basicDatatypes[current->common.type]->size; + OPAL_DATATYPE_SAFEGUARD_POINTER( source_base, blength, pConvertor->pBaseBuf, + pConvertor->pDesc, pConvertor->count ); + DO_DEBUG( opal_output( 0, "raw 1. iov[%d] = {base %p, length %" PRIsize_t "}\n", + index, (void*)source_base, blength ); ); + opal_convertor_merge_iov( iov, iov_count, + (IOVBASE_TYPE *) source_base, blength, &index ); + /* not check the return value, we know there was at least one element in the iovec */ + sum_iov_len += blength; + count_desc -= do_now; + + source_base += (current->extent - current->disp + + (current->blocklen - do_now) * opal_datatype_basicDatatypes[current->common.type]->size); + } + } + } + while( 1 ) { while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { - size_t blength = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; - source_base += pElem->elem.disp; - if( blength == (size_t)pElem->elem.extent ) { /* no resized data */ - if( index < *iov_count ) { - blength *= count_desc; - /* now here we have a basic datatype */ - OPAL_DATATYPE_SAFEGUARD_POINTER( source_base, blength, pConvertor->pBaseBuf, - pConvertor->pDesc, pConvertor->count ); - DO_DEBUG( opal_output( 0, "raw 1. iov[%d] = {base %p, length %" PRIsize_t "}\n", - index, (void*)source_base, blength ); ); - iov[index].iov_base = (IOVBASE_TYPE *) source_base; - iov[index].iov_len = blength; - source_base += blength; - raw_data += blength; - index++; - count_desc = 0; - } - } else { - for(size_t i = count_desc; (i > 0) && (index < *iov_count); i--, index++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( source_base, blength, pConvertor->pBaseBuf, - pConvertor->pDesc, pConvertor->count ); - DO_DEBUG( opal_output( 0, "raw 2. iov[%d] = {base %p, length %" PRIsize_t "}\n", - index, (void*)source_base, blength ); ); - iov[index].iov_base = (IOVBASE_TYPE *) source_base; - iov[index].iov_len = blength; - source_base += pElem->elem.extent; - raw_data += blength; - count_desc--; - } + const ddt_elem_desc_t* current = &(pElem->elem); + source_base += current->disp; + + do_now = current->count; + if( count_desc != (current->count * current->blocklen) ) { + do_now = count_desc / current->blocklen; + assert( 0 == (count_desc % current->blocklen) ); } - source_base -= pElem->elem.disp; + + blength = current->blocklen * opal_datatype_basicDatatypes[current->common.type]->size; + for(size_t _i = 0; _i < do_now; _i++ ) { + OPAL_DATATYPE_SAFEGUARD_POINTER( source_base, blength, pConvertor->pBaseBuf, + pConvertor->pDesc, pConvertor->count ); + DO_DEBUG( opal_output( 0, "raw 2. iov[%d] = {base %p, length %" PRIsize_t "}\n", + index, (void*)source_base, blength ); ); + if( opal_convertor_merge_iov( iov, iov_count, + (IOVBASE_TYPE *) source_base, blength, &index ) ) + break; /* no more iovec available, bail out */ + + source_base += current->extent; + sum_iov_len += blength; + count_desc -= current->blocklen; + } + if( 0 == count_desc ) { /* completed */ source_base = pConvertor->pBaseBuf + pStack->disp; pos_desc++; /* advance to the next data */ UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); continue; } + source_base -= current->disp; goto complete_loop; } if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ DO_DEBUG( opal_output( 0, "raw end_loop count %" PRIsize_t " stack_pos %d" - " pos_desc %d disp %ld space %lu\n", + " pos_desc %d disp %ld space %" PRIsize_t "\n", pStack->count, pConvertor->stack_pos, - pos_desc, (long)pStack->disp, (unsigned long)raw_data ); ); + pos_desc, (long)pStack->disp, sum_iov_len ); ); if( --(pStack->count) == 0 ) { /* end of loop */ - if( pConvertor->stack_pos == 0 ) { - /* we lie about the size of the next element in order to - * make sure we exit the main loop. - */ - *iov_count = index; - goto complete_loop; /* completed */ + if( 0 == pConvertor->stack_pos ) { + /* we're done. Force the exit of the main for loop (around iovec) */ + index++; /* account for the currently updating iovec */ + goto complete_loop; } pConvertor->stack_pos--; pStack--; @@ -155,15 +201,15 @@ opal_convertor_raw( opal_convertor_t* pConvertor, pStack->disp += (pData->ub - pData->lb); } else { assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); - pStack->disp += description[pStack->index].loop.extent; + pStack->disp += description[pStack->index].loop.extent; /* jump by the loop extent */ } } source_base = pConvertor->pBaseBuf + pStack->disp; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); DO_DEBUG( opal_output( 0, "raw new_loop count %" PRIsize_t " stack_pos %d " - "pos_desc %d disp %ld space %lu\n", + "pos_desc %d disp %ld space %" PRIsize_t "\n", pStack->count, pConvertor->stack_pos, - pos_desc, (long)pStack->disp, (unsigned long)raw_data ); ); + pos_desc, (long)pStack->disp, sum_iov_len ); ); } if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { ptrdiff_t local_disp = (ptrdiff_t)source_base; @@ -172,42 +218,39 @@ opal_convertor_raw( opal_convertor_t* pConvertor, if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { ptrdiff_t offset = end_loop->first_elem_disp; source_base += offset; - for(size_t i = MIN(count_desc, *iov_count - index); i > 0; i--, index++ ) { + for(; count_desc > 0; ) { OPAL_DATATYPE_SAFEGUARD_POINTER( source_base, end_loop->size, pConvertor->pBaseBuf, pConvertor->pDesc, pConvertor->count ); - iov[index].iov_base = (IOVBASE_TYPE *) source_base; - iov[index].iov_len = end_loop->size; + if( opal_convertor_merge_iov( iov, iov_count, + (IOVBASE_TYPE *) source_base, end_loop->size, &index ) ) { + source_base -= offset; + goto complete_loop; + } + source_base += pElem->loop.extent; - raw_data += end_loop->size; + sum_iov_len += end_loop->size; count_desc--; DO_DEBUG( opal_output( 0, "raw contig loop generate iov[%d] = {base %p, length %" PRIsize_t "}" - "space %lu [pos_desc %d]\n", + "space %" PRIsize_t " [pos_desc %d]\n", index, iov[index].iov_base, iov[index].iov_len, - (unsigned long)raw_data, pos_desc ); ); + sum_iov_len, pos_desc ); ); } source_base -= offset; - if( 0 == count_desc ) { /* completed */ - pos_desc += pElem->loop.items + 1; - goto update_loop_description; - } - } - if( index == *iov_count ) { /* all iov have been filled, we need to bail out */ - goto complete_loop; + pos_desc += pElem->loop.items + 1; + } else { + local_disp = (ptrdiff_t)source_base - local_disp; + PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, + pStack->disp + local_disp); + pos_desc++; } - local_disp = (ptrdiff_t)source_base - local_disp; - PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, - pStack->disp + local_disp); - pos_desc++; - update_loop_description: /* update the current state */ source_base = pConvertor->pBaseBuf + pStack->disp; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" ); - continue; } } complete_loop: - pConvertor->bConverted += raw_data; /* update the already converted bytes */ - *length = raw_data; + pConvertor->bConverted += sum_iov_len; /* update the already converted bytes */ + *length = sum_iov_len; *iov_count = index; if( pConvertor->bConverted == pConvertor->local_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; From 0a00b02e4882cc0cf612128a715073ea3f9ce688 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Sat, 18 May 2019 19:31:24 -0400 Subject: [PATCH 05/14] Small improvements on the test. Rework the to_self test to be able to be used as a benchmark. Signed-off-by: George Bosilca --- opal/datatype/opal_convertor.c | 2 +- opal/datatype/opal_datatype_optimize.c | 13 +- test/datatype/ddt_raw2.c | 29 +-- test/datatype/opal_datatype_test.c | 3 +- test/datatype/opal_ddt_lib.c | 4 +- test/datatype/to_self.c | 348 ++++++++++++++++--------- test/datatype/unpack_ooo.c | 21 +- 7 files changed, 267 insertions(+), 153 deletions(-) diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c index 331cb95a715..7a449302bff 100644 --- a/opal/datatype/opal_convertor.c +++ b/opal/datatype/opal_convertor.c @@ -357,7 +357,7 @@ opal_convertor_create_stack_with_pos_contig( opal_convertor_t* pConvertor, */ if( OPAL_LIKELY(0 == count) ) { pStack[1].type = pElems->elem.common.type; - pStack[1].count = pElems->elem.count; + pStack[1].count = pElems->elem.blocklen; } else { pStack[1].type = OPAL_DATATYPE_UINT1; pStack[1].count = pData->size - count; diff --git a/opal/datatype/opal_datatype_optimize.c b/opal/datatype/opal_datatype_optimize.c index 48ea0f3c78b..fbaacb592c2 100644 --- a/opal/datatype/opal_datatype_optimize.c +++ b/opal/datatype/opal_datatype_optimize.c @@ -167,15 +167,18 @@ opal_datatype_optimize_short( opal_datatype_t* pData, if( ((last.blocklen * opal_datatype_basicDatatypes[last.common.type]->size) == (current->blocklen * opal_datatype_basicDatatypes[current->common.type]->size)) && (current->disp == (last.disp + (ptrdiff_t)last.count * last.extent)) && - ((last.count == 1) || (current->count == 1) || (last.extent == current->extent)) ) { + ((current->count == 1) || (last.extent == current->extent)) ) { last.count += current->count; - if( last.count == 1 ) { - last.extent = current->extent; - } /* otherwise keep the last.extent */ /* find the lowest common denomitaor type */ if( last.common.type != current->common.type ) { - last.common.type = OPAL_DATATYPE_UINT1; last.blocklen *= opal_datatype_basicDatatypes[last.common.type]->size; + last.common.type = OPAL_DATATYPE_UINT1; + } + /* maximize the contiguous pieces */ + if( last.extent == (ptrdiff_t)(last.blocklen * opal_datatype_basicDatatypes[last.common.type]->size) ) { + last.blocklen *= last.count; + last.count = 1; + last.extent = last.blocklen * opal_datatype_basicDatatypes[last.common.type]->size; } continue; /* next data */ } diff --git a/test/datatype/ddt_raw2.c b/test/datatype/ddt_raw2.c index cc78e23006a..7e91a323f7a 100644 --- a/test/datatype/ddt_raw2.c +++ b/test/datatype/ddt_raw2.c @@ -33,9 +33,6 @@ mca_common_ompio_decode_datatype ( ompi_datatype_t *datatype, uint32_t *iovec_count, int increment) { - - - opal_convertor_t *convertor; size_t remaining_length = 0; uint32_t i; @@ -43,7 +40,6 @@ mca_common_ompio_decode_datatype ( ompi_datatype_t *datatype, struct iovec *temp_iov=NULL; size_t temp_data; - convertor = opal_convertor_create( opal_local_arch, 0 ); if (OMPI_SUCCESS != opal_convertor_prepare_for_send (convertor, @@ -55,9 +51,9 @@ mca_common_ompio_decode_datatype ( ompi_datatype_t *datatype, } if ( 0 == datatype->super.size ) { - *iovec_count = 0; - *iov = NULL; - return OMPI_SUCCESS; + *iovec_count = 0; + *iov = NULL; + return OMPI_SUCCESS; } remaining_length = count * datatype->super.size; @@ -69,10 +65,8 @@ mca_common_ompio_decode_datatype ( ompi_datatype_t *datatype, return OMPI_ERR_OUT_OF_RESOURCE; } - while (0 == opal_convertor_raw(convertor, - temp_iov, - &temp_count, - &temp_data)) { + while (0 == opal_convertor_raw(convertor, temp_iov, + &temp_count, &temp_data)) { *iovec_count = *iovec_count + temp_count; *iov = (struct iovec *) realloc (*iov, *iovec_count * sizeof(struct iovec)); if (NULL == *iov) { @@ -80,7 +74,7 @@ mca_common_ompio_decode_datatype ( ompi_datatype_t *datatype, free(temp_iov); return OMPI_ERR_OUT_OF_RESOURCE; } - for (i=0 ; i 0 ) { - *iov = (struct iovec *) realloc (*iov, *iovec_count * sizeof(struct iovec)); - if (NULL == *iov) { - opal_output(1, "OUT OF MEMORY\n"); + *iov = (struct iovec *) realloc (*iov, *iovec_count * sizeof(struct iovec)); + if (NULL == *iov) { + opal_output(1, "OUT OF MEMORY\n"); free(temp_iov); - return OMPI_ERR_OUT_OF_RESOURCE; - } + return OMPI_ERR_OUT_OF_RESOURCE; + } } for (i=0 ; idesc.used + 2 ); - if( (bLength == stride) || (1 >= count) ) { /* the elements are contiguous */ + if( (bLength == stride) || (1 == count) ) { /* the elements are contiguous */ opal_datatype_add( pData, oldType, count * bLength, 0, extent ); } else { if( 1 == bLength ) { @@ -476,7 +476,7 @@ static int32_t opal_datatype_create_hvector( int count, int bLength, ptrdiff_t s } pTempData = opal_datatype_create( oldType->desc.used + 2 ); - if( ((extent * bLength) == stride) || (1 >= count) ) { /* contiguous */ + if( ((extent * bLength) == stride) || (1 == count) ) { /* contiguous */ pData = pTempData; opal_datatype_add( pData, oldType, count * bLength, 0, extent ); } else { diff --git a/test/datatype/to_self.c b/test/datatype/to_self.c index 58849f5e90c..073fe4f0b57 100644 --- a/test/datatype/to_self.c +++ b/test/datatype/to_self.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ /* - * Copyright (c) 2004-2014 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * $COPYRIGHT$ @@ -15,8 +15,9 @@ #include #include #include +#include -#if OPEN_MPI && 0 +#if 0 && OPEN_MPI extern void ompi_datatype_dump( MPI_Datatype ddt ); #define MPI_DDT_DUMP(ddt) ompi_datatype_dump( (ddt) ) #else @@ -178,23 +179,145 @@ create_indexed_gap_optimized_ddt( void ) return dt3; } -static void print_result( int length, int cycles, double time ) -{ - double bandwidth, clock_prec; +/******************************************************************** + *******************************************************************/ + +#define DO_CONTIG 0x00000001 +#define DO_CONSTANT_GAP 0x00000002 +#define DO_INDEXED_GAP 0x00000004 +#define DO_OPTIMIZED_INDEXED_GAP 0x00000008 +#define DO_STRUCT_CONSTANT_GAP_RESIZED 0x00000010 + +#define DO_PACK 0x01000000 +#define DO_UNPACK 0x02000000 +#define DO_ISEND_RECV 0x04000000 +#define DO_ISEND_IRECV 0x08000000 +#define DO_IRECV_SEND 0x10000000 +#define DO_IRECV_ISEND 0x20000000 + +#define MIN_LENGTH 1024 +#define MAX_LENGTH (1024*1024) + +static int cycles = 100; +static int trials = 20; +static int warmups = 2; + +static void print_result( int length, int trials, double* timers ) +{ + double bandwidth, clock_prec, temp; + double min_time, max_time, average, std_dev = 0.0; + double ordered[trials]; + int t, pos, quartile_start, quartile_end; + + for( t = 0; t < trials; ordered[t] = timers[t], t++ ); + for( t = 0; t < trials-1; t++ ) { + temp = ordered[t]; + pos = t; + for( int i = t+1; i < trials; i++ ) { + if( temp > ordered[i] ) { + temp = ordered[i]; + pos = i; + } + } + if( pos != t ) { + temp = ordered[t]; + ordered[t] = ordered[pos]; + ordered[pos] = temp; + } + } + quartile_start = trials - (3 * trials) / 4; + quartile_end = trials - (1 * trials) / 4; clock_prec = MPI_Wtick(); - bandwidth = (length * clock_prec * cycles) / (1024.0 * 1024.0) / (time * clock_prec); - printf( "%8d\t%.6f\t%.4f MB/s\n", length, time / cycles, bandwidth ); + min_time = ordered[quartile_start]; + max_time = ordered[quartile_start]; + average = ordered[quartile_start]; + for( t = quartile_start + 1; t < quartile_end; t++ ) { + if( min_time > ordered[t] ) min_time = ordered[t]; + if( max_time < ordered[t] ) max_time = ordered[t]; + average += ordered[t]; + } + average /= (quartile_end - quartile_start); + for( t = quartile_start; t < quartile_end; t++ ) { + std_dev += (ordered[t] - average) * (ordered[t] - average); + } + std_dev = sqrt( std_dev/(quartile_end - quartile_start) ); + + bandwidth = (length * clock_prec) / (1024.0 * 1024.0) / (average * clock_prec); + printf( "%8d\t%15g\t%10.4f MB/s [min %10g max %10g std %2.2f%%]\n", length, average, bandwidth, + min_time, max_time, (100.0 * std_dev) / average ); +} + +static int pack( int cycles, + MPI_Datatype sdt, int scount, void* sbuf, + void* packed_buf ) +{ + int position, myself, c, t, outsize; + double timers[trials]; + + MPI_Type_size( sdt, &outsize ); + outsize *= scount; + + MPI_Comm_rank( MPI_COMM_WORLD, &myself ); + + for( t = 0; t < warmups; t++ ) { + for( c = 0; c < cycles; c++ ) { + position = 0; + MPI_Pack(sbuf, scount, sdt, packed_buf, outsize, &position, MPI_COMM_WORLD); + } + } + + for( t = 0; t < trials; t++ ) { + timers[t] = MPI_Wtime(); + for( c = 0; c < cycles; c++ ) { + position = 0; + MPI_Pack(sbuf, scount, sdt, packed_buf, outsize, &position, MPI_COMM_WORLD); + } + timers[t] = (MPI_Wtime() - timers[t]) / cycles; + } + print_result( outsize, trials, timers ); + return 0; +} + +static int unpack( int cycles, + void* packed_buf, + MPI_Datatype rdt, int rcount, void* rbuf ) +{ + int position, myself, c, t, insize; + double timers[trials]; + + MPI_Type_size( rdt, &insize ); + insize *= rcount; + + MPI_Comm_rank( MPI_COMM_WORLD, &myself ); + + for( t = 0; t < warmups; t++ ) { + for( c = 0; c < cycles; c++ ) { + position = 0; + MPI_Unpack(packed_buf, insize, &position, rbuf, rcount, rdt, MPI_COMM_WORLD); + } + } + + for( t = 0; t < trials; t++ ) { + timers[t] = MPI_Wtime(); + for( c = 0; c < cycles; c++ ) { + position = 0; + MPI_Unpack(packed_buf, insize, &position, rbuf, rcount, rdt, MPI_COMM_WORLD); + } + timers[t] = (MPI_Wtime() - timers[t]) / cycles; + } + print_result( insize, trials, timers ); + return 0; } static int isend_recv( int cycles, MPI_Datatype sdt, int scount, void* sbuf, MPI_Datatype rdt, int rcount, void* rbuf ) { - int myself, tag = 0, i, slength, rlength; + int myself, tag = 0, c, t, slength, rlength; MPI_Status status; MPI_Request req; - double tstart, tend; + double timers[trials]; MPI_Type_size( sdt, &slength ); slength *= scount; @@ -203,21 +326,16 @@ static int isend_recv( int cycles, MPI_Comm_rank( MPI_COMM_WORLD, &myself ); - tstart = MPI_Wtime(); - for( i = 0; i < cycles; i++ ) { -#ifndef FAST - MPI_Isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &req ); - MPI_Recv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &status ); - MPI_Wait( &req, &status ); - /*MPI_Request_free( &req );*/ -#else - ftmpi_mpi_isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &req ); - ftmpi_mpi_recv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &status ); - ftmpi_request_free( &req ); -#endif + for( t = 0; t < trials; t++ ) { + timers[t] = MPI_Wtime(); + for( c = 0; c < cycles; c++ ) { + MPI_Isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &req ); + MPI_Recv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &status ); + MPI_Wait( &req, &status ); + } + timers[t] = (MPI_Wtime() - timers[t]) / cycles; } - tend = MPI_Wtime(); - print_result( rlength, cycles, tend - tstart ); + print_result( rlength, trials, timers ); return 0; } @@ -225,10 +343,10 @@ static int irecv_send( int cycles, MPI_Datatype sdt, int scount, void* sbuf, MPI_Datatype rdt, int rcount, void* rbuf ) { - int myself, tag = 0, i, slength, rlength; + int myself, tag = 0, c, t, slength, rlength; MPI_Request req; MPI_Status status; - double tstart, tend; + double timers[trials]; MPI_Type_size( sdt, &slength ); slength *= scount; @@ -237,21 +355,16 @@ static int irecv_send( int cycles, MPI_Comm_rank( MPI_COMM_WORLD, &myself ); - tstart = MPI_Wtime(); - for( i = 0; i < cycles; i++ ) { -#ifndef FAST - MPI_Irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &req ); - MPI_Send( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD ); - MPI_Wait( &req, &status ); - /*MPI_Request_free( &req );*/ -#else - ftmpi_mpi_irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &req ); - ftmpi_mpi_send( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD ); - ftmpi_request_free( &req ); -#endif + for( t = 0; t < trials; t++ ) { + timers[t] = MPI_Wtime(); + for( c = 0; c < cycles; c++ ) { + MPI_Irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &req ); + MPI_Send( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD ); + MPI_Wait( &req, &status ); + } + timers[t] = (MPI_Wtime() - timers[t]) / cycles; } - tend = MPI_Wtime(); - print_result( rlength, cycles, tend - tstart ); + print_result( rlength, trials, timers ); return 0; } @@ -259,10 +372,10 @@ static int isend_irecv_wait( int cycles, MPI_Datatype sdt, int scount, void* sbuf, MPI_Datatype rdt, int rcount, void* rbuf ) { - int myself, tag = 0, i, slength, rlength; - MPI_Request sreq, rreq; - MPI_Status status; - double tstart, tend; + int myself, tag = 0, c, t, slength, rlength; + MPI_Request requests[2]; + MPI_Status statuses[2]; + double timers[trials]; MPI_Type_size( sdt, &slength ); slength *= scount; @@ -271,25 +384,16 @@ static int isend_irecv_wait( int cycles, MPI_Comm_rank( MPI_COMM_WORLD, &myself ); - tstart = MPI_Wtime(); - for( i = 0; i < cycles; i++ ) { -#ifndef FAST - MPI_Isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &sreq ); - MPI_Irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &rreq ); - MPI_Wait( &sreq, &status ); - MPI_Wait( &rreq, &status ); - /*MPI_Request_free( &sreq );*/ - /*MPI_Request_free( &rreq );*/ -#else - ftmpi_mpi_isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &sreq ); - ftmpi_mpi_irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &rreq ); - ftmpi_wait( &sreq, &status ); - ftmpi_request_free( &sreq ); - ftmpi_request_free( &rreq ); -#endif + for( t = 0; t < trials; t++ ) { + timers[t] = MPI_Wtime(); + for( c = 0; c < cycles; c++ ) { + MPI_Isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &requests[0] ); + MPI_Irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &requests[1] ); + MPI_Waitall( 2, requests, statuses ); + } + timers[t] = (MPI_Wtime() - timers[t]) / cycles; } - tend = MPI_Wtime(); - print_result( rlength, cycles, tend - tstart ); + print_result( rlength, trials, timers ); return 0; } @@ -297,10 +401,10 @@ static int irecv_isend_wait( int cycles, MPI_Datatype sdt, int scount, void* sbuf, MPI_Datatype rdt, int rcount, void* rbuf ) { - int myself, tag = 0, i, slength, rlength; - MPI_Request sreq, rreq; - MPI_Status status; - double tstart, tend; + int myself, tag = 0, c, t, slength, rlength; + MPI_Request requests[2]; + MPI_Status statuses[2]; + double timers[trials]; MPI_Type_size( sdt, &slength ); slength *= scount; @@ -309,74 +413,82 @@ static int irecv_isend_wait( int cycles, MPI_Comm_rank( MPI_COMM_WORLD, &myself ); - tstart = MPI_Wtime(); - for( i = 0; i < cycles; i++ ) { -#ifndef FAST - MPI_Irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &rreq ); - MPI_Isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &sreq ); - MPI_Wait( &sreq, &status ); - MPI_Wait( &rreq, &status ); - /*MPI_Request_free( &sreq );*/ - /*MPI_Request_free( &rreq );*/ -#else - ftmpi_mpi_irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &rreq ); - ftmpi_mpi_isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &sreq ); - ftmpi_wait( &sreq, &status ); - ftmpi_request_free( &sreq ); - ftmpi_request_free( &rreq ); -#endif + for( t = 0; t < trials; t++ ) { + timers[t] = MPI_Wtime(); + for( c = 0; c < cycles; c++ ) { + MPI_Irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &requests[0] ); + MPI_Isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &requests[1] ); + MPI_Waitall( 2, requests, statuses ); + } + timers[t] = (MPI_Wtime() - timers[t]) / cycles; } - tend = MPI_Wtime(); - print_result( rlength, cycles, tend - tstart ); + print_result( rlength, trials, timers); return 0; } -static int do_test_for_ddt( MPI_Datatype sddt, MPI_Datatype rddt, int length ) +static int do_test_for_ddt( int doop, MPI_Datatype sddt, MPI_Datatype rddt, int length ) { - int i; MPI_Aint lb, extent; char *sbuf, *rbuf; + int i; MPI_Type_get_extent( sddt, &lb, &extent ); sbuf = (char*)malloc( length ); rbuf = (char*)malloc( length ); - printf( "# Isend recv (length %d)\n", length ); - for( i = 1; i <= (length/extent); i *= 2 ) { - isend_recv( 10, sddt, i, sbuf, rddt, i, rbuf ); + if( doop & DO_PACK ) { + printf("# Pack (max length %d)\n", length); + for( i = 1; i <= (length/extent); i *= 2 ) { + pack( cycles, sddt, i, sbuf, rbuf ); + } } - printf( "# Isend Irecv Wait (length %d)\n", length ); - for( i = 1; i <= (length/extent); i *= 2 ) { - isend_irecv_wait( 10, sddt, i, sbuf, rddt, i, rbuf ); + + if( doop & DO_UNPACK ) { + printf("# Unpack (length %d)\n", length); + for( i = 1; i <= (length/extent); i *= 2 ) { + unpack( cycles, sbuf, rddt, i, rbuf ); + } } - printf( "# Irecv send (length %d)\n", length ); - for( i = 1; i <= (length/extent); i *= 2 ) { - irecv_send( 10, sddt, i, sbuf, rddt, i, rbuf ); + + if( doop & DO_ISEND_RECV ) { + printf( "# Isend recv (length %d)\n", length ); + for( i = 1; i <= (length/extent); i *= 2 ) { + isend_recv( cycles, sddt, i, sbuf, rddt, i, rbuf ); + } } - printf( "# Irecv Isend Wait (length %d)\n", length ); - for( i = 1; i <= (length/extent); i *= 2 ) { - irecv_isend_wait( 10, sddt, i, sbuf, rddt, i, rbuf ); + + if( doop & DO_ISEND_IRECV ) { + printf( "# Isend Irecv Wait (length %d)\n", length ); + for( i = 1; i <= (length/extent); i *= 2 ) { + isend_irecv_wait( cycles, sddt, i, sbuf, rddt, i, rbuf ); + } + } + + if( doop & DO_IRECV_SEND ) { + printf( "# Irecv send (length %d)\n", length ); + for( i = 1; i <= (length/extent); i *= 2 ) { + irecv_send( cycles, sddt, i, sbuf, rddt, i, rbuf ); + } + } + + if( doop & DO_IRECV_SEND ) { + printf( "# Irecv Isend Wait (length %d)\n", length ); + for( i = 1; i <= (length/extent); i *= 2 ) { + irecv_isend_wait( cycles, sddt, i, sbuf, rddt, i, rbuf ); + } } free( sbuf ); free( rbuf ); return 0; } -#define DO_CONTIG 0x01 -#define DO_CONSTANT_GAP 0x02 -#define DO_INDEXED_GAP 0x04 -#define DO_OPTIMIZED_INDEXED_GAP 0x08 -#define DO_STRUCT_CONSTANT_GAP_RESIZED 0x10 - -#define MIN_LENGTH 1024 -#define MAX_LENGTH (1024*1024) - int main( int argc, char* argv[] ) { - int run_tests = 0xffffffff; /* do all tests by default */ - int length, rank, size; + int run_tests = 0xffff; /* do all datatype tests by default */ + int rank, size; MPI_Datatype ddt; - /*int run_tests = DO_CONSTANT_GAP;*/ + run_tests |= DO_PACK | DO_UNPACK; + MPI_Init (&argc, &argv); MPI_Comm_rank (MPI_COMM_WORLD, &rank); @@ -389,16 +501,14 @@ int main( int argc, char* argv[] ) if( run_tests & DO_CONTIG ) { printf( "\ncontiguous datatype\n\n" ); - for( length = MIN_LENGTH; length < MAX_LENGTH; length <<=1 ) - do_test_for_ddt( MPI_INT, MPI_INT, length ); + do_test_for_ddt( run_tests, MPI_INT, MPI_INT, MAX_LENGTH ); } if( run_tests & DO_INDEXED_GAP ) { printf( "\nindexed gap\n\n" ); ddt = create_indexed_gap_ddt(); MPI_DDT_DUMP( ddt ); - for( length = MIN_LENGTH; length < MAX_LENGTH; length <<=1 ) - do_test_for_ddt( ddt, ddt, length ); + do_test_for_ddt( run_tests, ddt, ddt, MAX_LENGTH ); MPI_Type_free( &ddt ); } @@ -406,8 +516,7 @@ int main( int argc, char* argv[] ) printf( "\noptimized indexed gap\n\n" ); ddt = create_indexed_gap_optimized_ddt(); MPI_DDT_DUMP( ddt ); - for( length = MIN_LENGTH; length < MAX_LENGTH; length <<=1 ) - do_test_for_ddt( ddt, ddt, length ); + do_test_for_ddt( run_tests, ddt, ddt, MAX_LENGTH ); MPI_Type_free( &ddt ); } @@ -415,8 +524,7 @@ int main( int argc, char* argv[] ) printf( "\nconstant indexed gap\n\n" ); ddt = create_indexed_constant_gap_ddt( 80, 100, 1 ); MPI_DDT_DUMP( ddt ); - for( length = MIN_LENGTH; length < MAX_LENGTH; length <<=1 ) - do_test_for_ddt( ddt, ddt, length ); + do_test_for_ddt( run_tests, ddt, ddt, MAX_LENGTH ); MPI_Type_free( &ddt ); } @@ -424,8 +532,7 @@ int main( int argc, char* argv[] ) printf( "\noptimized constant indexed gap\n\n" ); ddt = create_optimized_indexed_constant_gap_ddt( 80, 100, 1 ); MPI_DDT_DUMP( ddt ); - for( length = MIN_LENGTH; length < MAX_LENGTH; length <<=1 ) - do_test_for_ddt( ddt, ddt, length ); + do_test_for_ddt( run_tests, ddt, ddt, MAX_LENGTH ); MPI_Type_free( &ddt ); } @@ -433,8 +540,7 @@ int main( int argc, char* argv[] ) printf( "\nstruct constant gap resized\n\n" ); ddt = create_struct_constant_gap_resized_ddt( 0 /* unused */, 0 /* unused */, 0 /* unused */ ); MPI_DDT_DUMP( ddt ); - for( length = MIN_LENGTH; length < MAX_LENGTH; length <<=1 ) - do_test_for_ddt( ddt, ddt, length ); + do_test_for_ddt( run_tests, ddt, ddt, MAX_LENGTH ); MPI_Type_free( &ddt ); } diff --git a/test/datatype/unpack_ooo.c b/test/datatype/unpack_ooo.c index 458ef550930..58ef8a95774 100644 --- a/test/datatype/unpack_ooo.c +++ b/test/datatype/unpack_ooo.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ /* - * Copyright (c) 2014 The University of Tennessee and The University + * Copyright (c) 2014-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2014 Research Organization for Information Science @@ -18,7 +18,6 @@ #include "opal/runtime/opal.h" #include "opal/datatype/opal_convertor.h" #include "opal/datatype/opal_datatype_internal.h" -// #include #include #include #include @@ -61,6 +60,18 @@ static void print_bar_pbar(struct foo_t* bar, struct pfoo_t* pbar) fprintf(stderr, "\n"); } +static void print_stack(opal_convertor_t* conv) +{ + printf("Stack pos %d [converted %" PRIsize_t "/%" PRIsize_t "]\n", + conv->stack_pos, conv->bConverted, conv->local_size); + for( uint32_t i = 0; i <= conv->stack_pos; i++ ) { + printf( "[%u] index %d, type %s count %" PRIsize_t " disp %p\n", + i, conv->pStack[i].index, opal_datatype_basicDatatypes[conv->pStack[i].type]->name, + conv->pStack[i].count, (void*)conv->pStack[i].disp); + } + printf("\n"); +} + static int testcase(ompi_datatype_t * newtype, size_t arr[10][2]) { int i, j, errors = 0; struct iovec a; @@ -104,6 +115,7 @@ static int testcase(ompi_datatype_t * newtype, size_t arr[10][2]) { max_data = a.iov_len; pos = arr[i][1]; opal_convertor_set_position(pConv, &pos); + print_stack(pConv); assert(arr[i][1] == pos); opal_convertor_unpack( pConv, &a, &iov_count, &max_data ); a.iov_base = (char*)a.iov_base - 1024; @@ -118,9 +130,10 @@ static int testcase(ompi_datatype_t * newtype, size_t arr[10][2]) { bar[j].d[1] != 0.0 || bar[j].d[2] != pbar[j].d[1]) { if(0 == errors) { - fprintf(stderr, "ERROR ! count=%d, position=%d, ptr = %p" + (void)opal_datatype_dump(&newtype->super); + fprintf(stderr, "ERROR ! position=%d/%d, ptr = %p" " got (%d,%d,%d,%g,%g,%g) expected (%d,%d,%d,%g,%g,%g)\n", - N, j, (void*)&bar[j], + j, N, (void*)&bar[j], bar[j].i[0], bar[j].i[1], bar[j].i[2], From 012a00480616cfd30c91de50635c0718d5cde72d Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Mon, 20 May 2019 11:43:29 -0400 Subject: [PATCH 06/14] Clean and sync the pack and unpack functions. - optimize handling of contiguous with gaps datatypes. - fixes a performance issue for all datatypes with a count of 1. - optimize the pack/unpack of contiguous with gaps datatype. - optimize the case of blocklen == 1 Signed-off-by: George Bosilca --- opal/datatype/opal_convertor_raw.c | 6 +- opal/datatype/opal_datatype_copy.h | 32 ++-- opal/datatype/opal_datatype_module.c | 1 - opal/datatype/opal_datatype_pack.c | 218 ++++++++++--------------- opal/datatype/opal_datatype_pack.h | 108 ++++++------ opal/datatype/opal_datatype_position.c | 60 ++++--- opal/datatype/opal_datatype_unpack.c | 128 +++++++-------- opal/datatype/opal_datatype_unpack.h | 112 +++++++------ 8 files changed, 314 insertions(+), 351 deletions(-) diff --git a/opal/datatype/opal_convertor_raw.c b/opal/datatype/opal_convertor_raw.c index df2340122a9..893792583f9 100644 --- a/opal/datatype/opal_convertor_raw.c +++ b/opal/datatype/opal_convertor_raw.c @@ -31,8 +31,8 @@ #endif /* OPAL_ENABLE_DEBUG */ /* Take a new iovec (base + len) and try to merge it with what we already - * have. If we succeed return 0 and move forward, if not save it into a new - * iovec location. If we need to go to a new position and we reach the end + * have. If we succeed return 0 and move forward, otherwise save it into a new + * iovec location. If we need to advance position and we reach the end * of the iovec array, return 1 to signal we did not saved the last iovec. */ static inline int @@ -46,7 +46,7 @@ opal_convertor_merge_iov( struct iovec* iov, uint32_t* iov_count, return 0; } /* cannot merge, move to the next position */ *idx = *idx + 1; - if( *idx == *iov_count ) return 1; /* do not overwrite outside the iove array boundaries */ + if( *idx == *iov_count ) return 1; /* do not overwrite outside the iovec array boundaries */ } iov[*idx].iov_base = base; iov[*idx].iov_len = len; diff --git a/opal/datatype/opal_datatype_copy.h b/opal/datatype/opal_datatype_copy.h index 40f119a684d..11058012e1e 100644 --- a/opal/datatype/opal_datatype_copy.h +++ b/opal/datatype/opal_datatype_copy.h @@ -51,11 +51,9 @@ static inline void _predefined_data( const dt_elem_desc_t* ELEM, const ddt_elem_desc_t* _elem = &((ELEM)->elem); unsigned char* _source = (SOURCE) + _elem->disp; unsigned char* _destination = (DESTINATION) + _elem->disp; - size_t total_count = _elem->count * _elem->blocklen; - size_t do_now, do_now_bytes; + size_t do_now = _elem->count, do_now_bytes; - assert( (COUNT) == total_count); - assert( total_count <= ((*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size) ); + assert( (COUNT) == (do_now * _elem->blocklen)); /* We don't a prologue and epilogue here as we are __always__ working * with full copies of the data description. @@ -64,21 +62,19 @@ static inline void _predefined_data( const dt_elem_desc_t* ELEM, /** * Compute how many full blocklen we need to do and do them. */ - do_now = _elem->count; - if( 0 != do_now ) { - do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size; - for(size_t _i = 0; _i < do_now; _i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( _source, do_now_bytes, (SOURCE_BASE), - (DATATYPE), (TOTAL_COUNT) ); - DO_DEBUG( opal_output( 0, "copy %s( %p, %p, %" PRIsize_t " ) => space %" PRIsize_t "\n", - STRINGIFY(MEM_OP_NAME), (void*)_destination, (void*)_source, do_now_bytes, *(SPACE) ); ); - MEM_OP( _destination, _source, do_now_bytes ); - _destination += _elem->extent; - _source += _elem->extent; - *(SPACE) -= do_now_bytes; - } - (COUNT) -= total_count; + do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size; + assert( (do_now * do_now_bytes) <= (*SPACE) ); + + for(size_t _i = 0; _i < do_now; _i++ ) { + OPAL_DATATYPE_SAFEGUARD_POINTER( _source, do_now_bytes, (SOURCE_BASE), + (DATATYPE), (TOTAL_COUNT) ); + DO_DEBUG( opal_output( 0, "copy %s( %p, %p, %" PRIsize_t " ) => space %" PRIsize_t "\n", + STRINGIFY(MEM_OP_NAME), (void*)_destination, (void*)_source, do_now_bytes, *(SPACE) - _i * do_now_bytes ); ); + MEM_OP( _destination, _source, do_now_bytes ); + _destination += _elem->extent; + _source += _elem->extent; } + *(SPACE) -= (do_now_bytes * do_now); } static inline void _contiguous_loop( const dt_elem_desc_t* ELEM, diff --git a/opal/datatype/opal_datatype_module.c b/opal/datatype/opal_datatype_module.c index 7976392b63e..d4415b21ef1 100644 --- a/opal/datatype/opal_datatype_module.c +++ b/opal/datatype/opal_datatype_module.c @@ -224,7 +224,6 @@ int32_t opal_datatype_init( void ) OPAL_DATATYPE_FLAG_CONTIGUOUS | OPAL_DATATYPE_FLAG_NO_GAPS; datatype->desc.desc[0].elem.common.type = i; - /* datatype->desc.desc[0].elem.blocklen XXX not set at the moment, it will be needed later */ datatype->desc.desc[0].elem.count = 1; datatype->desc.desc[0].elem.blocklen = 1; datatype->desc.desc[0].elem.disp = 0; diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index 55889fcaa55..cf69f6ada22 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2016 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -53,8 +53,6 @@ #endif /* defined(CHECKSUM) */ -#define IOVEC_MEM_LIMIT 8192 - /* the contig versions does not use the stack. They can easily retrieve * the status with just the informations from pConvertor->bConverted. */ @@ -68,9 +66,8 @@ opal_pack_homogeneous_contig_function( opal_convertor_t* pConv, unsigned char *source_base = NULL; uint32_t iov_count; size_t length = pConv->local_size - pConv->bConverted, initial_amount = pConv->bConverted; - ptrdiff_t initial_displ = pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp; - source_base = (pConv->pBaseBuf + initial_displ + pStack[0].disp + pStack[1].disp); + source_base = (pConv->pBaseBuf + pConv->pDesc->true_lb + pStack[0].disp + pStack[1].disp); /* There are some optimizations that can be done if the upper level * does not provide a buffer. @@ -111,155 +108,116 @@ opal_pack_homogeneous_contig_with_gaps_function( opal_convertor_t* pConv, uint32_t* out_size, size_t* max_data ) { + size_t remaining, length, initial_bytes_converted = pConv->bConverted; const opal_datatype_t* pData = pConv->pDesc; dt_stack_t* stack = pConv->pStack; + ptrdiff_t extent = pData->ub - pData->lb; unsigned char *user_memory, *packed_buffer; - uint32_t iov_count, index; + uint32_t idx; size_t i; - size_t bConverted, remaining, length, initial_bytes_converted = pConv->bConverted; - ptrdiff_t extent= pData->ub - pData->lb; - ptrdiff_t initial_displ = pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp; + /* The memory layout is contiguous with gaps in the begining and at the end. The datatype true_lb + * is the initial displacement, the size the length of the contiguous area and the extent represent + * how much we should jump between elements. + */ assert( (pData->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) && ((ptrdiff_t)pData->size != extent) ); DO_DEBUG( opal_output( 0, "pack_homogeneous_contig( pBaseBuf %p, iov_count %d )\n", (void*)pConv->pBaseBuf, *out_size ); ); if( stack[1].type != opal_datatype_uint1.id ) { stack[1].count *= opal_datatype_basicDatatypes[stack[1].type]->size; - stack[1].type = opal_datatype_uint1.id; + stack[1].type = opal_datatype_uint1.id; + } + /* We can provide directly the pointers in the user buffers (like the convertor_raw) */ + if( NULL == iov[0].iov_base ) { + user_memory = pConv->pBaseBuf + pData->true_lb; + + for( idx = 0; (idx < (*out_size)) && stack[0].count; idx++ ) { + iov[idx].iov_base = user_memory + stack[0].disp + stack[1].disp; + iov[idx].iov_len = stack[1].count; + COMPUTE_CSUM( iov[idx].iov_base, iov[idx].iov_len, pConv ); + + pConv->bConverted += stack[1].count; + + stack[0].disp += extent; + stack[0].count--; + stack[1].disp = 0; + stack[1].count = pData->size; /* we might need this to update the partial + * length for the first iteration */ + } + goto update_status_and_return; } - /* There are some optimizations that can be done if the upper level - * does not provide a buffer. - */ - for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { + for( idx = 0; idx < (*out_size); idx++ ) { /* Limit the amount of packed data to the data left over on this convertor */ remaining = pConv->local_size - pConv->bConverted; if( 0 == remaining ) break; /* we're done this time */ - if( remaining > iov[iov_count].iov_len ) - remaining = iov[iov_count].iov_len; - packed_buffer = (unsigned char *)iov[iov_count].iov_base; - bConverted = remaining; /* how much will get unpacked this time */ - user_memory = pConv->pBaseBuf + initial_displ + stack[0].disp + stack[1].disp; - i = pConv->count - stack[0].count; /* how many we already packed */ - assert(i == (pConv->bConverted / pData->size)); - - if( packed_buffer == NULL ) { - /* special case for small data. We avoid allocating memory if we - * can fill the iovec directly with the address of the remaining - * data. - */ - if( stack->count < (size_t)((*out_size) - iov_count) ) { - stack[1].count = pData->size - (pConv->bConverted % pData->size); - for( index = iov_count; i < pConv->count; i++, index++ ) { - iov[index].iov_base = (IOVBASE_TYPE *) user_memory; - iov[index].iov_len = stack[1].count; - stack[0].disp += extent; - pConv->bConverted += stack[1].count; - stack[1].disp = 0; /* reset it for the next round */ - stack[1].count = pData->size; - user_memory = pConv->pBaseBuf + initial_displ + stack[0].disp; - COMPUTE_CSUM( iov[index].iov_base, iov[index].iov_len, pConv ); - } - *out_size = iov_count + index; - *max_data = (pConv->bConverted - initial_bytes_converted); - pConv->flags |= CONVERTOR_COMPLETED; - return 1; /* we're done */ - } - /* now special case for big contiguous data with gaps around */ - if( pData->size >= IOVEC_MEM_LIMIT ) { - /* as we dont have to copy any data, we can simply fill the iovecs - * with data from the user data description. - */ - for( index = iov_count; (i < pConv->count) && (index < (*out_size)); - i++, index++ ) { - if( remaining < pData->size ) { - iov[index].iov_base = (IOVBASE_TYPE *) user_memory; - iov[index].iov_len = remaining; - remaining = 0; - COMPUTE_CSUM( iov[index].iov_base, iov[index].iov_len, pConv ); - break; - } else { - iov[index].iov_base = (IOVBASE_TYPE *) user_memory; - iov[index].iov_len = pData->size; - user_memory += extent; - COMPUTE_CSUM( iov[index].iov_base, (size_t)iov[index].iov_len, pConv ); - } - remaining -= iov[index].iov_len; - pConv->bConverted += iov[index].iov_len; - } - *out_size = index; - *max_data = (pConv->bConverted - initial_bytes_converted); - if( pConv->bConverted == pConv->local_size ) { - pConv->flags |= CONVERTOR_COMPLETED; - return 1; - } - return 0; + if( remaining > iov[idx].iov_len ) + remaining = iov[idx].iov_len; + packed_buffer = (unsigned char *)iov[idx].iov_base; + pConv->bConverted += remaining; + user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp + stack[1].disp; + + DO_DEBUG( opal_output( 0, "pack_homogeneous_contig( user_memory %p, packed_buffer %p length %" PRIsize_t "\n", + (void*)user_memory, (void*)packed_buffer, remaining ); ); + + length = (0 == pConv->stack_pos ? 0 : stack[1].count); /* left over from the last pack */ + /* data left from last round and enough space in the buffer */ + if( (pData->size != length) && (length <= remaining)) { + /* copy the partial left-over from the previous round */ + OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, length, pConv->pBaseBuf, + pData, pConv->count ); + DO_DEBUG( opal_output( 0, "pack dest %p src %p length %" PRIsize_t " [prologue]\n", + (void*)user_memory, (void*)packed_buffer, length ); ); + MEMCPY_CSUM( packed_buffer, user_memory, length, pConv ); + packed_buffer += length; + remaining -= length; + stack[1].count -= length; + stack[1].disp += length; /* just in case, we overwrite this below */ + if( 0 == stack[1].count) { /* one completed element */ + stack[0].count--; + stack[0].disp += extent; + if( 0 == stack[0].count ) /* not yet done */ + break; + stack[1].count = pData->size; + stack[1].disp = 0; } + user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp + stack[1].disp; } - { - DO_DEBUG( opal_output( 0, "pack_homogeneous_contig( user_memory %p, packed_buffer %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); ); - - length = (0 == pConv->stack_pos ? 0 : stack[1].count); /* left over from the last pack */ - /* data left from last round and enough space in the buffer */ - if( (0 != length) && (length <= remaining)) { - /* copy the partial left-over from the previous round */ - OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, length, pConv->pBaseBuf, - pData, pConv->count ); - DO_DEBUG( opal_output( 0, "2. pack dest %p src %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)length ); ); - MEMCPY_CSUM( packed_buffer, user_memory, length, pConv ); - packed_buffer += length; - user_memory += (extent - pData->size + length); - remaining -= length; - stack[1].count -= length; - if( 0 == stack[1].count) { /* one completed element */ - stack[0].count--; - stack[0].disp += extent; - if( 0 != stack[0].count ) { /* not yet done */ - stack[1].count = pData->size; - stack[1].disp = 0; - } - } - } - for( i = 0; pData->size <= remaining; i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, pData->size, pConv->pBaseBuf, - pData, pConv->count ); - DO_DEBUG( opal_output( 0, "3. pack dest %p src %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)pData->size ); ); - MEMCPY_CSUM( packed_buffer, user_memory, pData->size, pConv ); - packed_buffer += pData->size; - user_memory += extent; - remaining -= pData->size; - } - stack[0].count -= i; /* the filled up and the entire types */ - stack[0].disp += (i * extent); - stack[1].disp += remaining; - /* Copy the last bits */ - if( 0 != remaining ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, pConv->pBaseBuf, - pData, pConv->count ); - DO_DEBUG( opal_output( 0, "4. pack dest %p src %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); ); - MEMCPY_CSUM( packed_buffer, user_memory, remaining, pConv ); - user_memory += remaining; - stack[1].count -= remaining; - } + for( i = 0; pData->size <= remaining; i++ ) { + OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, pData->size, pConv->pBaseBuf, + pData, pConv->count ); + DO_DEBUG( opal_output( 0, "pack dest %p src %p length %" PRIsize_t " [%" PRIsize_t "/%" PRIsize_t "\n", + (void*)user_memory, (void*)packed_buffer, pData->size, remaining, iov[idx].iov_len ); ); + MEMCPY_CSUM( packed_buffer, user_memory, pData->size, pConv ); + packed_buffer += pData->size; + user_memory += extent; + remaining -= pData->size; + } + stack[0].count -= i; /* the entire datatype copied above */ + stack[0].disp += (i * extent); + + /* Copy the last bits */ + if( 0 != remaining ) { + OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, pConv->pBaseBuf, + pData, pConv->count ); + DO_DEBUG( opal_output( 0, "4. pack dest %p src %p length %" PRIsize_t "\n", + (void*)user_memory, (void*)packed_buffer, remaining ); ); + MEMCPY_CSUM( packed_buffer, user_memory, remaining, pConv ); + stack[1].count -= remaining; + stack[1].disp += remaining; /* keep the += in case we are copying less that the datatype size */ if( 0 == stack[1].count ) { /* prepare for the next element */ stack[1].count = pData->size; stack[1].disp = 0; } } - pConv->bConverted += bConverted; } - *out_size = iov_count; - *max_data = (pConv->bConverted - initial_bytes_converted); - if( pConv->bConverted == pConv->local_size ) { - pConv->flags |= CONVERTOR_COMPLETED; - return 1; - } - return 0; + + update_status_and_return: + *out_size = idx; + *max_data = pConv->bConverted - initial_bytes_converted; + if( pConv->bConverted == pConv->local_size ) pConv->flags |= CONVERTOR_COMPLETED; + return !!(pConv->flags & CONVERTOR_COMPLETED); /* done or not */ } /* The pack/unpack functions need a cleanup. I have to create a proper interface to access diff --git a/opal/datatype/opal_datatype_pack.h b/opal/datatype/opal_datatype_pack.h index 66259f8b66b..514f8bd7b02 100644 --- a/opal/datatype/opal_datatype_pack.h +++ b/opal/datatype/opal_datatype_pack.h @@ -35,82 +35,90 @@ pack_predefined_data( opal_convertor_t* CONVERTOR, size_t* SPACE ) { const ddt_elem_desc_t* _elem = &((ELEM)->elem); - size_t total_count = _elem->count * _elem->blocklen; size_t cando_count = (*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size; size_t do_now, do_now_bytes; + size_t blocklen_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size; unsigned char* _memory = (*memory) + _elem->disp; + unsigned char* _packed = *packed; assert( *(COUNT) <= _elem->count * _elem->blocklen); if( cando_count > *(COUNT) ) cando_count = *(COUNT); - /** - * First check if we already did something on this element ? - */ - do_now = (total_count - *(COUNT)); /* done elements */ - if( 0 != do_now ) { - do_now = do_now % _elem->blocklen; /* partial blocklen? */ - - if( 0 != do_now ) { - size_t left_in_block = _elem->blocklen - do_now; /* left in the current blocklen */ - do_now = (left_in_block > cando_count ) ? cando_count : left_in_block; - do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; - - OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "pack 1. memcpy( %p, %p, %lu ) => space %lu [prolog]\n", - (void*)*(packed), (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); - MEMCPY_CSUM( *(packed), _memory, do_now_bytes, (CONVERTOR) ); - _memory = (*memory) + _elem->disp + (ptrdiff_t)do_now_bytes; - /* compensate if we just completed a blocklen */ - if( do_now == left_in_block ) - _memory += _elem->extent - (_elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size); - *(packed) += do_now_bytes; - *(SPACE) -= do_now_bytes; - *(COUNT) -= do_now; - cando_count -= do_now; + if( 1 == _elem->blocklen ) { /* Do as many full blocklen as possible */ + *(COUNT) -= cando_count; + for(; cando_count > 0; cando_count--) { + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n", + (void*)_packed, (void*)_memory, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); + MEMCPY_CSUM( _packed, _memory, blocklen_bytes, (CONVERTOR) ); + _packed += blocklen_bytes; + _memory += _elem->extent; } + goto update_and_return; } + blocklen_bytes *= _elem->blocklen; /** - * Compute how many full blocklen we need to do and do them. + * First check if we already did something on this element ? The COUNT is the number + * of remaining predefined types in the current elem, not how many predefined types + * should be manipulated in the current call (this number is instead reflected on the + * SPACE). */ - do_now = cando_count / _elem->blocklen; + do_now = *(COUNT) % _elem->blocklen; /* any partial elements ? */ + /* premptively update the number of COUNT we will return. */ + *(COUNT) -= cando_count; if( 0 != do_now ) { - do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size; - for(size_t _i = 0; _i < do_now; _i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n", - (void*)*(packed), (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)*(SPACE) ); ); - MEMCPY_CSUM( *(packed), _memory, do_now_bytes, (CONVERTOR) ); - *(packed) += do_now_bytes; - _memory += _elem->extent; - *(SPACE) -= do_now_bytes; - *(COUNT) -= _elem->blocklen; - cando_count -= _elem->blocklen; - } + size_t left_in_block = do_now; /* left in the current blocklen */ + do_now = (do_now > cando_count ) ? cando_count : do_now; + do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; + + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "pack 1. memcpy( %p, %p, %lu ) => space %lu [prolog]\n", + _packed, (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); + MEMCPY_CSUM( _packed, _memory, do_now_bytes, (CONVERTOR) ); + _memory += (ptrdiff_t)do_now_bytes; + /* compensate if we just completed a blocklen */ + if( do_now == left_in_block ) + _memory += _elem->extent - blocklen_bytes; + _packed += do_now_bytes; + cando_count -= do_now; + } + + /* Do as many full blocklen as possible */ + for(size_t _i = 0; _elem->blocklen <= cando_count; _i++ ) { + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n", + (void*)_packed, (void*)_memory, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); + MEMCPY_CSUM( _packed, _memory, blocklen_bytes, (CONVERTOR) ); + _packed += blocklen_bytes; + _memory += _elem->extent; + cando_count -= _elem->blocklen; } /** * As an epilog do anything left from the last blocklen. */ - do_now = cando_count; - if( 0 != do_now ) { - do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; + if( 0 != cando_count ) { + assert( cando_count < _elem->blocklen ); + do_now_bytes = cando_count * opal_datatype_basicDatatypes[_elem->common.type]->size; OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); + (CONVERTOR)->pDesc, (CONVERTOR)->count ); DO_DEBUG( opal_output( 0, "pack 3. memcpy( %p, %p, %lu ) => space %lu [epilog]\n", - (void*)*(packed), (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); - MEMCPY_CSUM( *(packed), _memory, do_now_bytes, (CONVERTOR) ); + (void*)_packed, (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); + MEMCPY_CSUM( _packed, _memory, do_now_bytes, (CONVERTOR) ); _memory += do_now_bytes; - *(packed) += do_now_bytes; - *(SPACE) -= do_now_bytes; - *(COUNT) -= do_now; + _packed += do_now_bytes; } + update_and_return: *(memory) = _memory - _elem->disp; + *(SPACE) -= (_packed - *packed); + *(packed) = _packed; } static inline void pack_contiguous_loop( opal_convertor_t* CONVERTOR, diff --git a/opal/datatype/opal_datatype_position.c b/opal/datatype/opal_datatype_position.c index 381a31086d6..f8137c7e0cb 100644 --- a/opal/datatype/opal_datatype_position.c +++ b/opal/datatype/opal_datatype_position.c @@ -49,10 +49,24 @@ * - the DT_CONTIGUOUS flag for the type OPAL_DATATYPE_END_LOOP is meaningless. */ +static inline void +position_single_block(opal_convertor_t* CONVERTOR, + unsigned char** mem, ptrdiff_t mem_update, + size_t* space, size_t space_update, + size_t* cnt, size_t cnt_update) +{ + OPAL_DATATYPE_SAFEGUARD_POINTER( *mem, mem_update, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "position( %p, %lu ) => space %lu [prolog]\n", + (void*)*mem, (unsigned long)space_update, (unsigned long)(*space) ); ); + *mem += mem_update; + *space -= space_update; + *cnt -= cnt_update; +} + /** - * Advance the current position in the convertor based using the - * current element and a left-over counter. Update the head pointer - * and the leftover byte space. + * Advance the convertors' position according. Update the pointer and the remaining space + * accordingly. */ static inline void position_predefined_data( opal_convertor_t* CONVERTOR, @@ -64,7 +78,7 @@ position_predefined_data( opal_convertor_t* CONVERTOR, const ddt_elem_desc_t* _elem = &((ELEM)->elem); size_t total_count = _elem->count * _elem->blocklen; size_t cando_count = (*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size; - size_t do_now, do_now_bytes; + size_t do_now, do_now_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size; unsigned char* _memory = (*POINTER) + _elem->disp; assert( *(COUNT) <= _elem->count * _elem->blocklen); @@ -72,6 +86,15 @@ position_predefined_data( opal_convertor_t* CONVERTOR, if( cando_count > *(COUNT) ) cando_count = *(COUNT); + if( 1 == _elem->blocklen ) { + DO_DEBUG( opal_output( 0, "position( %p, %" PRIsize_t " ) x (count %" PRIsize_t ", extent %ld) => space %lu [prolog]\n", + (void*)_memory, (unsigned long)do_now_bytes, cando_count, _elem->extent, (unsigned long)(*SPACE) ); ); + _memory += cando_count * _elem->extent; + *SPACE -= cando_count * do_now_bytes; + *COUNT -= cando_count; + goto update_and_return; + } + /** * First check if we already did something on this element ? */ @@ -84,16 +107,12 @@ position_predefined_data( opal_convertor_t* CONVERTOR, do_now = (left_in_block > cando_count ) ? cando_count : left_in_block; do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; - OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "position( %p, %lu ) => space %lu [prolog]\n", - (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); - _memory = *(POINTER) + _elem->disp + (ptrdiff_t)do_now_bytes; + position_single_block( CONVERTOR, &_memory, do_now_bytes, + SPACE, do_now_bytes, COUNT, do_now ); + /* compensate if we just completed a blocklen */ if( do_now == left_in_block ) _memory += _elem->extent - (_elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size); - *(SPACE) -= do_now_bytes; - *(COUNT) -= do_now; cando_count -= do_now; } } @@ -105,13 +124,8 @@ position_predefined_data( opal_convertor_t* CONVERTOR, if( 0 != do_now ) { do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size; for(size_t _i = 0; _i < do_now; _i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "position( %p, %lu ) => space %lu\n", - (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)*(SPACE) ); ); - _memory += _elem->extent; - *(SPACE) -= do_now_bytes; - *(COUNT) -= _elem->blocklen; + position_single_block( CONVERTOR, &_memory, _elem->extent, + SPACE, do_now_bytes, COUNT, _elem->blocklen ); cando_count -= _elem->blocklen; } } @@ -122,15 +136,11 @@ position_predefined_data( opal_convertor_t* CONVERTOR, do_now = cando_count; if( 0 != do_now ) { do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; - OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "position( %p, %lu ) => space %lu [epilog]\n", - (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); - _memory += do_now_bytes; - *(SPACE) -= do_now_bytes; - *(COUNT) -= do_now; + position_single_block( CONVERTOR, &_memory, do_now_bytes, + SPACE, do_now_bytes, COUNT, do_now ); } + update_and_return: *(POINTER) = _memory - _elem->disp; } diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index 3edb9161923..ac35a03c267 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -70,98 +70,82 @@ opal_unpack_homogeneous_contig_function( opal_convertor_t* pConv, { const opal_datatype_t *pData = pConv->pDesc; unsigned char *user_memory, *packed_buffer; - uint32_t iov_count, i; - size_t bConverted, remaining, length, initial_bytes_converted = pConv->bConverted; + uint32_t iov_idx, i; + size_t remaining, initial_bytes_converted = pConv->bConverted; dt_stack_t* stack = pConv->pStack; ptrdiff_t extent = pData->ub - pData->lb; - ptrdiff_t initial_displ = pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp; - DO_DEBUG( opal_output( 0, "unpack_homogeneous_contig( pBaseBuf %p, iov_count %d )\n", + DO_DEBUG( opal_output( 0, "unpack_homogeneous_contig( pBaseBuf %p, iov count %d )\n", (void*)pConv->pBaseBuf, *out_size ); ); if( stack[1].type != opal_datatype_uint1.id ) { stack[1].count *= opal_datatype_basicDatatypes[stack[1].type]->size; stack[1].type = opal_datatype_uint1.id; } - for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { - remaining = pConv->local_size - pConv->bConverted; - if( 0 == remaining ) break; /* we're done this time */ - if( remaining > iov[iov_count].iov_len ) - remaining = iov[iov_count].iov_len; - packed_buffer = (unsigned char*)iov[iov_count].iov_base; - bConverted = remaining; /* how much will get unpacked this time */ - user_memory = pConv->pBaseBuf + initial_displ; - - if( (ptrdiff_t)pData->size == extent ) { - user_memory += pConv->bConverted; - DO_DEBUG( opal_output( 0, "unpack_homogeneous_contig( user_memory %p, packed_buffer %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); ); + + if( (ptrdiff_t)pData->size == extent ) { + for( iov_idx = 0; iov_idx < (*out_size); iov_idx++ ) { + remaining = pConv->local_size - pConv->bConverted; + if( 0 == remaining ) break; /* we're done this time */ + if( remaining > iov[iov_idx].iov_len ) + remaining = iov[iov_idx].iov_len; + + packed_buffer = (unsigned char*)iov[iov_idx].iov_base; + user_memory = pConv->pBaseBuf + pData->true_lb + pConv->bConverted; /* contiguous data or basic datatype with count */ OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, pConv->pBaseBuf, pData, pConv->count ); - DO_DEBUG( opal_output( 0, "1. unpack contig dest %p src %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); ); + DO_DEBUG( opal_output( 0, "unpack contig [%d] dest %p src %p length %" PRIsize_t "\n", + iov_idx, (void*)user_memory, (void*)packed_buffer, remaining ); ); MEMCPY_CSUM( user_memory, packed_buffer, remaining, pConv ); - } else { - user_memory += stack[0].disp + stack[1].disp; + pConv->bConverted += remaining; /* how much will get unpacked this time */ + } + } else { + for( iov_idx = 0; iov_idx < (*out_size); iov_idx++ ) { + remaining = pConv->local_size - pConv->bConverted; + if( 0 == remaining ) break; /* we're done this time */ + if( remaining > iov[iov_idx].iov_len ) + remaining = iov[iov_idx].iov_len; + + packed_buffer = (unsigned char*)iov[iov_idx].iov_base; + user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp + stack[1].disp; + pConv->bConverted += remaining; /* how much will get unpacked this time */ + + for( i = 0; stack[1].count <= remaining; i++ ) { /* partial or full data */ + OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, stack[1].count, pConv->pBaseBuf, + pData, pConv->count ); + DO_DEBUG( opal_output( 0, "unpack gaps [%d] dest %p src %p length %" PRIsize_t " [%d]\n", + iov_idx, (void*)user_memory, (void*)packed_buffer, stack[1].count, i ); ); + MEMCPY_CSUM( user_memory, packed_buffer, stack[1].count, pConv ); - DO_DEBUG( opal_output( 0, "unpack_homogeneous_contig( user_memory %p, packed_buffer %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); ); + packed_buffer += stack[1].count; + remaining -= stack[1].count; - length = (0 == pConv->stack_pos ? 0 : stack[1].count); /* left over from the last unpack */ - /* complete the last copy */ - if( (0 != length) && (length <= remaining) ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, length, pConv->pBaseBuf, - pData, pConv->count ); - DO_DEBUG( opal_output( 0, "2. unpack dest %p src %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)length ); ); - MEMCPY_CSUM( user_memory, packed_buffer, length, pConv ); - packed_buffer += length; - user_memory += (extent - (pData->size - length)); - remaining -= length; - stack[1].count -= length; - if( 0 == stack[1].count) { /* one completed element */ - stack[0].count--; - stack[0].disp += extent; - if( 0 != stack[0].count ) { /* not yet done */ - stack[1].count = pData->size; - stack[1].disp = 0; - } - } - } - for( i = 0; pData->size <= remaining; i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, pData->size, pConv->pBaseBuf, - pData, pConv->count ); - DO_DEBUG( opal_output( 0, "3. unpack dest %p src %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)pData->size ); ); - MEMCPY_CSUM( user_memory, packed_buffer, pData->size, pConv ); - packed_buffer += pData->size; - user_memory += extent; - remaining -= pData->size; + stack[0].count--; + stack[0].disp += extent; + stack[1].count = pData->size; + stack[1].disp = 0; + + user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp; } - stack[0].count -= i; - stack[0].disp += (i * extent); - stack[1].disp += remaining; - /* copy the last bits */ + + /* Copy the last bits */ if( 0 != remaining ) { OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, pConv->pBaseBuf, pData, pConv->count ); - DO_DEBUG( opal_output( 0, "4. unpack dest %p src %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); ); + DO_DEBUG( opal_output( 0, "unpack gaps [%d] dest %p src %p length %" PRIsize_t " [epilog]\n", + iov_idx, (void*)user_memory, (void*)packed_buffer, remaining ); ); MEMCPY_CSUM( user_memory, packed_buffer, remaining, pConv ); - user_memory += remaining; stack[1].count -= remaining; + stack[1].disp += remaining; /* keep the += in case we are copying less that the datatype size */ + assert( stack[1].count ); } } - pConv->bConverted += bConverted; } - *out_size = iov_count; /* we only reach this line after the for loop succesfully complete */ - *max_data = (pConv->bConverted - initial_bytes_converted); - if( pConv->bConverted == pConv->local_size ) { - pConv->flags |= CONVERTOR_COMPLETED; - return 1; - } - return 0; + *out_size = iov_idx; /* we only reach this line after the for loop succesfully complete */ + *max_data = pConv->bConverted - initial_bytes_converted; + if( pConv->bConverted == pConv->local_size ) pConv->flags |= CONVERTOR_COMPLETED; + return !!(pConv->flags & CONVERTOR_COMPLETED); /* done or not */ } /** @@ -179,7 +163,7 @@ opal_unpack_homogeneous_contig_function( opal_convertor_t* pConv, static inline void opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pElem, unsigned char* partial_data, - ptrdiff_t start_position, ptrdiff_t length, + ptrdiff_t start_position, size_t length, unsigned char** user_buffer ) { char unused_byte = 0x7F, saved_data[16]; @@ -195,7 +179,7 @@ opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pEle /* Find a byte that is not used in the partial buffer */ find_unused_byte: - for(ptrdiff_t i = 0; i < length; i++ ) { + for(size_t i = 0; i < length; i++ ) { if( unused_byte == partial_data[i] ) { unused_byte--; goto find_unused_byte; @@ -306,7 +290,7 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, COMPUTE_CSUM( iov_ptr, missing_length, pConvertor ); opal_unpack_partial_datatype( pConvertor, pElem, iov_ptr, - pConvertor->partial_length, element_length - pConvertor->partial_length, + pConvertor->partial_length, (size_t)(element_length - pConvertor->partial_length), &conv_ptr ); --count_desc; if( 0 == count_desc ) { diff --git a/opal/datatype/opal_datatype_unpack.h b/opal/datatype/opal_datatype_unpack.h index f51a609294d..5a3679bc37f 100644 --- a/opal/datatype/opal_datatype_unpack.h +++ b/opal/datatype/opal_datatype_unpack.h @@ -35,82 +35,90 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR, size_t* SPACE ) { const ddt_elem_desc_t* _elem = &((ELEM)->elem); - size_t total_count = _elem->count * _elem->blocklen; size_t cando_count = (*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size; size_t do_now, do_now_bytes; + size_t blocklen_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size; unsigned char* _memory = (*memory) + _elem->disp; + unsigned char* _packed = *packed; - assert( *(COUNT) <= _elem->count * _elem->blocklen); + assert( *(COUNT) <= (_elem->count * _elem->blocklen)); if( cando_count > *(COUNT) ) cando_count = *(COUNT); - /** - * First check if we already did something on this element ? - */ - do_now = (total_count - *(COUNT)); /* done elements */ - if( 0 != do_now ) { - do_now = do_now % _elem->blocklen; /* partial blocklen? */ - - if( 0 != do_now ) { - size_t left_in_block = _elem->blocklen - do_now; /* left in the current blocklen */ - do_now = (left_in_block > cando_count ) ? cando_count : left_in_block; - do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; - - OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "unpack 1. memcpy( %p, %p, %lu ) => space %lu [prolog]\n", - (void*)_memory, (void*)*(packed), (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); - MEMCPY_CSUM( _memory, *(packed), do_now_bytes, (CONVERTOR) ); - _memory = (*memory) + _elem->disp + (ptrdiff_t)do_now_bytes; - /* compensate if we just completed a blocklen */ - if( do_now == left_in_block ) - _memory += _elem->extent - (_elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size); - *(packed) += do_now_bytes; - *(SPACE) -= do_now_bytes; - *(COUNT) -= do_now; - cando_count -= do_now; + if( 1 == _elem->blocklen ) { /* Do as many full blocklen as possible */ + *(COUNT) -= cando_count; + for(; cando_count > 0; cando_count--) { + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "unpack 2. memcpy( %p, %p, %lu ) => space %lu\n", + (void*)_memory, (void*)_packed, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); + MEMCPY_CSUM( _memory, _packed, blocklen_bytes, (CONVERTOR) ); + _packed += blocklen_bytes; + _memory += _elem->extent; } + goto update_and_return; } + blocklen_bytes *= _elem->blocklen; /** - * Compute how many full blocklen we need to do and do them. + * First check if we already did something on this element ? The COUNT is the number + * of remaining predefined types in the current elem, not how many predefined types + * should be manipulated in the current call (this number is instead reflected on the + * SPACE). */ - do_now = cando_count / _elem->blocklen; + do_now = *(COUNT) % _elem->blocklen; /* any partial elements ? */ + /* premptively update the number of COUNT we will return. */ + *(COUNT) -= cando_count; if( 0 != do_now ) { - do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size; - for(size_t _i = 0; _i < do_now; _i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n", - (void*)_memory, (void*)*(packed), (unsigned long)do_now_bytes, (unsigned long)*(SPACE) ); ); - MEMCPY_CSUM( _memory, *(packed), do_now_bytes, (CONVERTOR) ); - *(packed) += do_now_bytes; - _memory += _elem->extent; - *(SPACE) -= do_now_bytes; - *(COUNT) -= _elem->blocklen; - cando_count -= _elem->blocklen; - } + size_t left_in_block = do_now; /* left in the current blocklen */ + do_now = (do_now > cando_count ) ? cando_count : do_now; + do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; + + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "unpack 1. memcpy( %p, %p, %lu ) => space %lu [prolog]\n", + (void*)_memory, (void*)_packed, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); + MEMCPY_CSUM( _memory, _packed, do_now_bytes, (CONVERTOR) ); + _memory += (ptrdiff_t)do_now_bytes; + /* compensate if we just completed a blocklen */ + if( do_now == left_in_block ) + _memory += _elem->extent - blocklen_bytes; + _packed += do_now_bytes; + cando_count -= do_now; + } + + /* Do as many full blocklen as possible */ + for(size_t _i = 0; _elem->blocklen <= cando_count; _i++ ) { + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "unpack 2. memcpy( %p, %p, %lu ) => space %lu\n", + (void*)_memory, (void*)_packed, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); + MEMCPY_CSUM( _memory, _packed, blocklen_bytes, (CONVERTOR) ); + _packed += blocklen_bytes; + _memory += _elem->extent; + cando_count -= _elem->blocklen; } /** * As an epilog do anything left from the last blocklen. */ - do_now = cando_count; - if( 0 != do_now ) { - do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; + if( 0 != cando_count ) { + assert( cando_count < _elem->blocklen ); + do_now_bytes = cando_count * opal_datatype_basicDatatypes[_elem->common.type]->size; OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "pack 3. memcpy( %p, %p, %lu ) => space %lu [epilog]\n", - (void*)_memory, (void*)*(packed), (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); - MEMCPY_CSUM( _memory, *(packed), do_now_bytes, (CONVERTOR) ); + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "unpack 3. memcpy( %p, %p, %lu ) => space %lu [epilog]\n", + (void*)_memory, (void*)_packed, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); + MEMCPY_CSUM( _memory, _packed, do_now_bytes, (CONVERTOR) ); _memory += do_now_bytes; - *(packed) += do_now_bytes; - *(SPACE) -= do_now_bytes; - *(COUNT) -= do_now; + _packed += do_now_bytes; } + update_and_return: *(memory) = _memory - _elem->disp; + *(SPACE) -= (_packed - *packed); + *(packed) = _packed; } static inline void unpack_contiguous_loop( opal_convertor_t* CONVERTOR, From 78cc0ff89193b0ec7034b4ea26f93aefb83e7d15 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Tue, 28 May 2019 14:54:40 -0400 Subject: [PATCH 07/14] Disable checksum. Signed-off-by: George Bosilca --- opal/datatype/opal_convertor.c | 16 +++++++++------- opal/datatype/opal_convertor.h | 6 ++++-- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c index 7a449302bff..4754723f68a 100644 --- a/opal/datatype/opal_convertor.c +++ b/opal/datatype/opal_convertor.c @@ -579,8 +579,9 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, assert(! (convertor->flags & CONVERTOR_SEND)); OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); - if( convertor->flags & CONVERTOR_WITH_CHECKSUM ) { - if( !(convertor->flags & CONVERTOR_HOMOGENEOUS) ) { +#if defined(CHECKSUM) + if( OPAL_UNLIKELY(convertor->flags & CONVERTOR_WITH_CHECKSUM) ) { + if( OPAL_UNLIKELY(!(convertor->flags & CONVERTOR_HOMOGENEOUS)) ) { convertor->fAdvance = opal_unpack_general_checksum; } else { if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { @@ -589,8 +590,9 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, convertor->fAdvance = opal_generic_simple_unpack_checksum; } } - } else { - if( !(convertor->flags & CONVERTOR_HOMOGENEOUS) ) { + } else +#endif /* defined(CHECKSUM) */ + if( OPAL_UNLIKELY(!(convertor->flags & CONVERTOR_HOMOGENEOUS)) ) { convertor->fAdvance = opal_unpack_general; } else { if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { @@ -599,7 +601,6 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, convertor->fAdvance = opal_generic_simple_unpack; } } - } return OPAL_SUCCESS; } @@ -618,6 +619,7 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); +#if defined(CHECKSUM) if( convertor->flags & CONVERTOR_WITH_CHECKSUM ) { if( CONVERTOR_SEND_CONVERSION == (convertor->flags & (CONVERTOR_SEND_CONVERSION|CONVERTOR_HOMOGENEOUS)) ) { convertor->fAdvance = opal_pack_general_checksum; @@ -632,7 +634,8 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, convertor->fAdvance = opal_generic_simple_pack_checksum; } } - } else { + } else +#endif /* defined(CHECKSUM) */ if( CONVERTOR_SEND_CONVERSION == (convertor->flags & (CONVERTOR_SEND_CONVERSION|CONVERTOR_HOMOGENEOUS)) ) { convertor->fAdvance = opal_pack_general; } else { @@ -646,7 +649,6 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, convertor->fAdvance = opal_generic_simple_pack; } } - } return OPAL_SUCCESS; } diff --git a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h index 875c111b1f1..b24d94c37b0 100644 --- a/opal/datatype/opal_convertor.h +++ b/opal/datatype/opal_convertor.h @@ -332,8 +332,10 @@ opal_convertor_set_position( opal_convertor_t* convertor, /* Remove the completed flag if it's already set */ convertor->flags &= ~CONVERTOR_COMPLETED; - if( !(convertor->flags & CONVERTOR_WITH_CHECKSUM) && - (convertor->flags & OPAL_DATATYPE_FLAG_NO_GAPS) && + if( (convertor->flags & OPAL_DATATYPE_FLAG_NO_GAPS) && +#if defined(CHECKSUM) + !(convertor->flags & CONVERTOR_WITH_CHECKSUM) && +#endif /* defined(CHECKSUM) */ (convertor->flags & (CONVERTOR_SEND | CONVERTOR_HOMOGENEOUS)) ) { /* Contiguous and no checkpoint and no homogeneous unpack */ convertor->bConverted = *position; From d5cdfe70eff1371f69edf847bc1b164bd7e05d92 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Fri, 21 Jun 2019 13:15:12 -0400 Subject: [PATCH 08/14] Optimize the position placement. Upon detecting a datatype loop representation skip the entire loop according the the remaining space. Signed-off-by: George Bosilca --- opal/datatype/opal_datatype_position.c | 67 +++++++++----------------- 1 file changed, 23 insertions(+), 44 deletions(-) diff --git a/opal/datatype/opal_datatype_position.c b/opal/datatype/opal_datatype_position.c index f8137c7e0cb..204d670a3ef 100644 --- a/opal/datatype/opal_datatype_position.c +++ b/opal/datatype/opal_datatype_position.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2014 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -123,11 +123,18 @@ position_predefined_data( opal_convertor_t* CONVERTOR, do_now = cando_count / _elem->blocklen; if( 0 != do_now ) { do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size; +#if OPAL_ENABLE_DEBUG for(size_t _i = 0; _i < do_now; _i++ ) { position_single_block( CONVERTOR, &_memory, _elem->extent, SPACE, do_now_bytes, COUNT, _elem->blocklen ); cando_count -= _elem->blocklen; } +#else + _memory += do_now * _elem->extent; + *SPACE -= do_now * do_now_bytes; + *COUNT -= do_now * _elem->blocklen; + cando_count -= do_now * _elem->blocklen; +#endif /* OPAL_ENABLE_DEBUG */ } /** @@ -144,48 +151,16 @@ position_predefined_data( opal_convertor_t* CONVERTOR, *(POINTER) = _memory - _elem->disp; } -/** - * Advance the current position in the convertor based using the - * current contiguous loop and a left-over counter. Update the head - * pointer and the leftover byte space. - */ -static inline void -position_contiguous_loop( opal_convertor_t* CONVERTOR, - dt_elem_desc_t* ELEM, - size_t* COUNT, - unsigned char** POINTER, - size_t* SPACE ) -{ - ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); - ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + (ELEM)->loop.items); - size_t _copy_loops = *(COUNT); - - if( (_copy_loops * _end_loop->size) > *(SPACE) ) - _copy_loops = *(SPACE) / _end_loop->size; - OPAL_DATATYPE_SAFEGUARD_POINTER( *(POINTER) + _end_loop->first_elem_disp, - (_copy_loops - 1) * _loop->extent + _end_loop->size, - (CONVERTOR)->pBaseBuf, (CONVERTOR)->pDesc, (CONVERTOR)->count ); - *(POINTER) += _copy_loops * _loop->extent; - *(SPACE) -= _copy_loops * _end_loop->size; - *(COUNT) -= _copy_loops; -} - -#define POSITION_PREDEFINED_DATATYPE( CONVERTOR, ELEM, COUNT, POSITION, SPACE ) \ - position_predefined_data( (CONVERTOR), (ELEM), &(COUNT), &(POSITION), &(SPACE) ) - -#define POSITION_CONTIGUOUS_LOOP( CONVERTOR, ELEM, COUNT, POSITION, SPACE ) \ - position_contiguous_loop( (CONVERTOR), (ELEM), &(COUNT), &(POSITION), &(SPACE) ) - int opal_convertor_generic_simple_position( opal_convertor_t* pConvertor, size_t* position ) { dt_stack_t* pStack; /* pointer to the position on the stack */ uint32_t pos_desc; /* actual position in the description of the derived datatype */ size_t count_desc; /* the number of items already done in the actual pos_desc */ + size_t iov_len_local; dt_elem_desc_t* description = pConvertor->use_desc->desc; dt_elem_desc_t* pElem; /* current position */ unsigned char *base_pointer = pConvertor->pBaseBuf; - size_t iov_len_local; ptrdiff_t extent = pConvertor->pDesc->ub - pConvertor->pDesc->lb; DUMP( "opal_convertor_generic_simple_position( %p, &%ld )\n", (void*)pConvertor, (long)*position ); @@ -236,21 +211,19 @@ int opal_convertor_generic_simple_position( opal_convertor_t* pConvertor, assert(pConvertor->partial_length < element_length); return 0; } - pConvertor->partial_length = (pConvertor->partial_length + missing_length) % element_length; - assert(pConvertor->partial_length == 0); + pConvertor->partial_length = 0; pConvertor->bConverted += missing_length; iov_len_local -= missing_length; count_desc--; } while( 1 ) { - if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ + if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the the entire datatype */ DO_DEBUG( opal_output( 0, "position end_loop count %" PRIsize_t " stack_pos %d pos_desc %d disp %lx space %lu\n", pStack->count, pConvertor->stack_pos, pos_desc, pStack->disp, (unsigned long)iov_len_local ); ); if( --(pStack->count) == 0 ) { /* end of loop */ if( pConvertor->stack_pos == 0 ) { pConvertor->flags |= CONVERTOR_COMPLETED; - pConvertor->partial_length = 0; goto complete_loop; /* completed */ } pConvertor->stack_pos--; @@ -259,11 +232,13 @@ int opal_convertor_generic_simple_position( opal_convertor_t* pConvertor, } else { if( pStack->index == -1 ) { pStack->disp += extent; + pos_desc = 0; /* back to the first element */ } else { assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); pStack->disp += description[pStack->index].loop.extent; + pos_desc = pStack->index; /* go back to the loop start itself to give a chance + * to move forward by entire loops */ } - pos_desc = pStack->index + 1; } base_pointer = pConvertor->pBaseBuf + pStack->disp; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); @@ -273,9 +248,14 @@ int opal_convertor_generic_simple_position( opal_convertor_t* pConvertor, } if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { ptrdiff_t local_disp = (ptrdiff_t)base_pointer; - if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - POSITION_CONTIGUOUS_LOOP( pConvertor, pElem, count_desc, - base_pointer, iov_len_local ); + ddt_endloop_desc_t* end_loop = (ddt_endloop_desc_t*)(pElem + pElem->loop.items); + size_t full_loops = iov_len_local / end_loop->size; + full_loops = count_desc <= full_loops ? count_desc : full_loops; + if( full_loops ) { + base_pointer += full_loops * pElem->loop.extent; + iov_len_local -= full_loops * end_loop->size; + count_desc -= full_loops; + if( 0 == count_desc ) { /* completed */ pos_desc += pElem->loop.items + 1; goto update_loop_description; @@ -297,8 +277,7 @@ int opal_convertor_generic_simple_position( opal_convertor_t* pConvertor, } while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ - POSITION_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, - base_pointer, iov_len_local ); + position_predefined_data( pConvertor, pElem, &count_desc, &base_pointer, &iov_len_local ); if( 0 != count_desc ) { /* completed */ pConvertor->partial_length = iov_len_local; goto complete_loop; From fad707d3b05f7e4baef3c55c1dbfaf4537b348c9 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 26 Jun 2019 12:55:44 -0400 Subject: [PATCH 09/14] Rework the datatype commit. Optimize contiguous loops by collapsing them into a single element. During datatype optimization collapse similar elements into larger blocks. Signed-off-by: George Bosilca --- opal/datatype/opal_datatype_internal.h | 12 ++++-- opal/datatype/opal_datatype_optimize.c | 60 ++++++++++++++++++-------- 2 files changed, 49 insertions(+), 23 deletions(-) diff --git a/opal/datatype/opal_datatype_internal.h b/opal/datatype/opal_datatype_internal.h index 2b2ddc0961e..1f10c9138aa 100644 --- a/opal/datatype/opal_datatype_internal.h +++ b/opal/datatype/opal_datatype_internal.h @@ -215,10 +215,8 @@ union dt_elem_desc { /** - * Create one or more elements depending on the value of _count. If the value - * is too large for the type of elem.count then use oth the elem.count and - * elem.blocklen to create it. If the number is prime then create a second - * element to account for the difference. + * Create an element entry in the description. If the element is contiguous + * collapse everything into the blocklen. */ #define CREATE_ELEM(_place, _type, _flags, _blocklen, _count, _disp, _extent) \ do { \ @@ -228,6 +226,12 @@ union dt_elem_desc { (_place)->elem.count = (_count); \ (_place)->elem.extent = (_extent); \ (_place)->elem.disp = (_disp); \ + if( _extent == (ptrdiff_t)(_blocklen * opal_datatype_basicDatatypes[_type]->size) ) { \ + /* collapse it into a single large blocklen */ \ + (_place)->elem.blocklen *= _count; \ + (_place)->elem.extent *= _count; \ + (_place)->elem.count = 1; \ + } \ } while(0) /* * This array holds the descriptions desc.desc[2] of the predefined basic datatypes. diff --git a/opal/datatype/opal_datatype_optimize.c b/opal/datatype/opal_datatype_optimize.c index fbaacb592c2..336e11f0560 100644 --- a/opal/datatype/opal_datatype_optimize.c +++ b/opal/datatype/opal_datatype_optimize.c @@ -60,27 +60,27 @@ opal_datatype_optimize_short( opal_datatype_t* pData, CREATE_ELEM( pElemDesc, last.common.type, OPAL_DATATYPE_FLAG_BASIC, last.blocklen, last.count, last.disp, last.extent ); pElemDesc++; nbElems++; - last.disp += last.count; last.count= 0; } CREATE_LOOP_END( pElemDesc, nbElems - pStack->index + 1, /* # of elems in this loop */ end_loop->first_elem_disp, end_loop->size, end_loop->common.flags ); - pElemDesc++; nbElems++; if( --stack_pos >= 0 ) { /* still something to do ? */ ddt_loop_desc_t* pStartLoop = &(pTypeDesc->desc[pStack->index - 1].loop); - pStartLoop->items = end_loop->items; + pStartLoop->items = pElemDesc->end_loop.items; total_disp = pStack->disp; /* update the displacement position */ } + pElemDesc++; nbElems++; pStack--; /* go down one position on the stack */ pos_desc++; continue; } if( OPAL_DATATYPE_LOOP == pData->desc.desc[pos_desc].elem.common.type ) { ddt_loop_desc_t* loop = (ddt_loop_desc_t*)&(pData->desc.desc[pos_desc]); - ddt_endloop_desc_t* end_loop = (ddt_endloop_desc_t*)&(pData->desc.desc[pos_desc + loop->items]); int index = GET_FIRST_NON_LOOP( &(pData->desc.desc[pos_desc]) ); if( loop->common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { + ddt_endloop_desc_t* end_loop = (ddt_endloop_desc_t*)&(pData->desc.desc[pos_desc + loop->items]); + assert(pData->desc.desc[pos_desc + index].elem.disp == end_loop->first_elem_disp); compress.common.flags = loop->common.flags; compress.common.type = pData->desc.desc[pos_desc + index].elem.common.type; @@ -99,7 +99,12 @@ opal_datatype_optimize_short( opal_datatype_t* pData, compress.count = loop->loops; compress.extent = loop->extent; compress.disp = end_loop->first_elem_disp; - + if( compress.extent == (ptrdiff_t)(compress.blocklen * opal_datatype_basicDatatypes[compress.common.type]->size) ) { + /* The compressed element is contiguous: collapse it into a single large blocklen */ + compress.blocklen *= compress.count; + compress.extent *= compress.count; + compress.count = 1; + } /** * The current loop has been compressed and can now be treated as if it * was a data element. We can now look if it can be fused with last, @@ -161,26 +166,43 @@ opal_datatype_optimize_short( opal_datatype_t* pData, } /* are the two elements compatible: aka they have very similar values and they - * can be merged together by increasing the count. This optimizes the memory - * required for storing the datatype description. + * can be merged together by increasing the count, and/or changing the extent. */ - if( ((last.blocklen * opal_datatype_basicDatatypes[last.common.type]->size) == - (current->blocklen * opal_datatype_basicDatatypes[current->common.type]->size)) && - (current->disp == (last.disp + (ptrdiff_t)last.count * last.extent)) && - ((current->count == 1) || (last.extent == current->extent)) ) { - last.count += current->count; - /* find the lowest common denomitaor type */ + if( (last.blocklen * opal_datatype_basicDatatypes[last.common.type]->size) == + (current->blocklen * opal_datatype_basicDatatypes[current->common.type]->size) ) { + ddt_elem_desc_t save = last; /* safekeep the type and blocklen */ if( last.common.type != current->common.type ) { last.blocklen *= opal_datatype_basicDatatypes[last.common.type]->size; last.common.type = OPAL_DATATYPE_UINT1; } - /* maximize the contiguous pieces */ - if( last.extent == (ptrdiff_t)(last.blocklen * opal_datatype_basicDatatypes[last.common.type]->size) ) { - last.blocklen *= last.count; - last.count = 1; - last.extent = last.blocklen * opal_datatype_basicDatatypes[last.common.type]->size; + + if( 1 == last.count ) { + /* we can ignore the extent of the element with count == 1 and merge them together if their displacements match */ + if( 1 == current->count ) { + last.extent = current->disp - last.disp; + last.count++; + continue; + } + /* can we compute a matching displacement ? */ + if( (last.disp + current->extent) == current->disp ) { + last.extent = current->extent; + last.count = current->count + 1; + continue; + } } - continue; /* next data */ + if( (last.extent * (ptrdiff_t)last.count + last.disp) == current->disp ) { + if( 1 == current->count ) { + last.count++; + continue; + } + if( last.extent == current->extent ) { + last.count += current->count; + continue; + } + } + last.blocklen = save.blocklen; + last.common.type = save.common.type; + /* try other optimizations */ } /* are the elements fusionable such that we can fusion the last blocklen of one with the first * blocklen of the other. From 87299e0b1c3a14b3ca70799fc0be12ef98ed7bcd Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 10 Jul 2019 00:28:29 -0400 Subject: [PATCH 10/14] Get rid of the division in the critical path. Amazing how a bad instruction scheduling can have such a drastic impact on the code performance. With this change, the get a boost of at least 50% on the performance of data with a small blocklen and/or count. Signed-off-by: George Bosilca --- opal/datatype/opal_datatype_pack.h | 28 +++++++++++++++++++--------- opal/datatype/opal_datatype_unpack.h | 27 +++++++++++++++++++-------- 2 files changed, 38 insertions(+), 17 deletions(-) diff --git a/opal/datatype/opal_datatype_pack.h b/opal/datatype/opal_datatype_pack.h index 514f8bd7b02..4da9bd2450e 100644 --- a/opal/datatype/opal_datatype_pack.h +++ b/opal/datatype/opal_datatype_pack.h @@ -35,19 +35,24 @@ pack_predefined_data( opal_convertor_t* CONVERTOR, size_t* SPACE ) { const ddt_elem_desc_t* _elem = &((ELEM)->elem); - size_t cando_count = (*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size; - size_t do_now, do_now_bytes; size_t blocklen_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size; + size_t cando_count = *(COUNT), do_now, do_now_bytes; unsigned char* _memory = (*memory) + _elem->disp; unsigned char* _packed = *packed; assert( *(COUNT) <= _elem->count * _elem->blocklen); - if( cando_count > *(COUNT) ) - cando_count = *(COUNT); + if( (blocklen_bytes * cando_count) > *(SPACE) ) + cando_count = (*SPACE) / blocklen_bytes; + do_now = *(COUNT); /* save the COUNT for later */ + /* premptively update the number of COUNT we will return. */ + *(COUNT) -= cando_count; + + if( 1 == _elem->count ) { /* Everything is contiguous, handle it as a prologue */ + goto do_epilog; + } if( 1 == _elem->blocklen ) { /* Do as many full blocklen as possible */ - *(COUNT) -= cando_count; for(; cando_count > 0; cando_count--) { OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, (CONVERTOR)->pDesc, (CONVERTOR)->count ); @@ -59,17 +64,19 @@ pack_predefined_data( opal_convertor_t* CONVERTOR, } goto update_and_return; } - blocklen_bytes *= _elem->blocklen; + blocklen_bytes *= _elem->blocklen; + if( (_elem->count * _elem->blocklen) == cando_count ) { + goto skip_prolog; + } /** * First check if we already did something on this element ? The COUNT is the number * of remaining predefined types in the current elem, not how many predefined types * should be manipulated in the current call (this number is instead reflected on the * SPACE). */ - do_now = *(COUNT) % _elem->blocklen; /* any partial elements ? */ - /* premptively update the number of COUNT we will return. */ - *(COUNT) -= cando_count; + do_now = do_now % _elem->blocklen; /* any partial elements ? */ + if( 0 != do_now ) { size_t left_in_block = do_now; /* left in the current blocklen */ do_now = (do_now > cando_count ) ? cando_count : do_now; @@ -88,6 +95,7 @@ pack_predefined_data( opal_convertor_t* CONVERTOR, cando_count -= do_now; } + skip_prolog: /* Do as many full blocklen as possible */ for(size_t _i = 0; _elem->blocklen <= cando_count; _i++ ) { OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, @@ -104,6 +112,8 @@ pack_predefined_data( opal_convertor_t* CONVERTOR, * As an epilog do anything left from the last blocklen. */ if( 0 != cando_count ) { + + do_epilog: assert( cando_count < _elem->blocklen ); do_now_bytes = cando_count * opal_datatype_basicDatatypes[_elem->common.type]->size; OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, diff --git a/opal/datatype/opal_datatype_unpack.h b/opal/datatype/opal_datatype_unpack.h index 5a3679bc37f..49a418ba2b3 100644 --- a/opal/datatype/opal_datatype_unpack.h +++ b/opal/datatype/opal_datatype_unpack.h @@ -35,19 +35,24 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR, size_t* SPACE ) { const ddt_elem_desc_t* _elem = &((ELEM)->elem); - size_t cando_count = (*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size; - size_t do_now, do_now_bytes; size_t blocklen_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size; + size_t cando_count = (*COUNT), do_now, do_now_bytes; unsigned char* _memory = (*memory) + _elem->disp; unsigned char* _packed = *packed; assert( *(COUNT) <= (_elem->count * _elem->blocklen)); - if( cando_count > *(COUNT) ) - cando_count = *(COUNT); + if( (blocklen_bytes * cando_count) > *(SPACE) ) + cando_count = (*SPACE) / blocklen_bytes; + do_now = *(COUNT); /* save the COUNT for later */ + /* premptively update the number of COUNT we will return. */ + *(COUNT) -= cando_count; + + if( 1 == _elem->count ) { /* Everything is contiguous, handle it as a prologue */ + goto do_epilog; + } if( 1 == _elem->blocklen ) { /* Do as many full blocklen as possible */ - *(COUNT) -= cando_count; for(; cando_count > 0; cando_count--) { OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, (CONVERTOR)->pDesc, (CONVERTOR)->count ); @@ -59,7 +64,11 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR, } goto update_and_return; } + blocklen_bytes *= _elem->blocklen; + if( (_elem->count * _elem->blocklen) == cando_count ) { + goto skip_prolog; + } /** * First check if we already did something on this element ? The COUNT is the number @@ -67,9 +76,8 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR, * should be manipulated in the current call (this number is instead reflected on the * SPACE). */ - do_now = *(COUNT) % _elem->blocklen; /* any partial elements ? */ - /* premptively update the number of COUNT we will return. */ - *(COUNT) -= cando_count; + do_now = do_now % _elem->blocklen; /* any partial elements ? */ + if( 0 != do_now ) { size_t left_in_block = do_now; /* left in the current blocklen */ do_now = (do_now > cando_count ) ? cando_count : do_now; @@ -88,6 +96,7 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR, cando_count -= do_now; } + skip_prolog: /* Do as many full blocklen as possible */ for(size_t _i = 0; _elem->blocklen <= cando_count; _i++ ) { OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, @@ -104,6 +113,8 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR, * As an epilog do anything left from the last blocklen. */ if( 0 != cando_count ) { + + do_epilog: assert( cando_count < _elem->blocklen ); do_now_bytes = cando_count * opal_datatype_basicDatatypes[_elem->common.type]->size; OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, From f78d3d52cd32846fab0cceeb624a1f51caaa9fca Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 10 Jul 2019 11:30:59 -0400 Subject: [PATCH 11/14] Optimize the pack/unpack. Start optimizing the code. This commit divides the operations in 2 parts, the first, outside the critical part, deals with partial blocks of predefined elements, and the second, inside the critical path, only deals with full blocks of elements. This reduces the number of expensive operations in the critical path and results in a decent performance increase. Signed-off-by: George Bosilca --- opal/datatype/opal_datatype_pack.c | 30 ++++-- opal/datatype/opal_datatype_pack.h | 135 ++++++++++++++++---------- opal/datatype/opal_datatype_unpack.c | 82 +++++++++------- opal/datatype/opal_datatype_unpack.h | 140 +++++++++++++++++---------- 4 files changed, 238 insertions(+), 149 deletions(-) diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index cf69f6ada22..c0ab6df66d8 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -272,18 +272,32 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { iov_ptr = (unsigned char *) iov[iov_count].iov_base; iov_len_local = iov[iov_count].iov_len; - while( 1 ) { - while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { - /* now here we have a basic datatype */ - PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, - conv_ptr, iov_ptr, iov_len_local ); - if( 0 == count_desc ) { /* completed */ + + if( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + if( (pElem->elem.count * pElem->elem.blocklen) != count_desc ) { + /* we have a partial (less than blocklen) basic datatype */ + int rc = PACK_PARTIAL_BLOCKLEN( pConvertor, pElem, count_desc, + conv_ptr, iov_ptr, iov_len_local ); + if( 0 == rc ) /* not done */ + goto complete_loop; + if( 0 == count_desc ) { conv_ptr = pConvertor->pBaseBuf + pStack->disp; pos_desc++; /* advance to the next data */ UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - continue; } - goto complete_loop; + } + } + + while( 1 ) { + while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + /* we have a basic datatype (working on full blocks) */ + PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, + conv_ptr, iov_ptr, iov_len_local ); + if( 0 != count_desc ) /* completed? */ + goto complete_loop; + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + pos_desc++; /* advance to the next data */ + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); } if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ DO_DEBUG( opal_output( 0, "pack end_loop count %" PRIsize_t " stack_pos %d" diff --git a/opal/datatype/opal_datatype_pack.h b/opal/datatype/opal_datatype_pack.h index 4da9bd2450e..1eaf2e8b9f9 100644 --- a/opal/datatype/opal_datatype_pack.h +++ b/opal/datatype/opal_datatype_pack.h @@ -26,6 +26,63 @@ CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) ) #endif +/** + * This function deals only with partial elements. The COUNT points however to the whole leftover count, + * but this function is only expected to operate on an amount less than blength, that would allow the rest + * of the pack process to handle only entire blength blocks (plus the left over). + * + * Return 1 if we are now aligned on a block, 0 otherwise. + */ +static inline int +pack_partial_blocklen( opal_convertor_t* CONVERTOR, + const dt_elem_desc_t* ELEM, + size_t* COUNT, + unsigned char** memory, + unsigned char** packed, + size_t* SPACE ) +{ + const ddt_elem_desc_t* _elem = &((ELEM)->elem); + size_t do_now_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size; + size_t do_now = *(COUNT); + unsigned char* _memory = (*memory) + _elem->disp; + unsigned char* _packed = *packed; + + assert( *(COUNT) <= _elem->count * _elem->blocklen); + + /** + * First check if we already did something on this element ? The COUNT is the number + * of remaining predefined types in the current elem, not how many predefined types + * should be manipulated in the current call (this number is instead reflected on the + * SPACE). + */ + if( 0 == (do_now = (*COUNT) % _elem->blocklen) ) + return 1; + + size_t left_in_block = do_now; /* left in the current blocklen */ + + if( (do_now_bytes * do_now) > *(SPACE) ) + do_now = (*SPACE) / do_now_bytes; + + do_now_bytes *= do_now; + + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "pack memcpy( %p, %p, %lu ) => space %lu [partial]\n", + _packed, (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); + MEMCPY_CSUM( _packed, _memory, do_now_bytes, (CONVERTOR) ); + *(memory) += (ptrdiff_t)do_now_bytes; + if( do_now == left_in_block ) /* compensate if completed a blocklen */ + *(memory) += _elem->extent - (_elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size); + + *(COUNT) -= do_now; + *(SPACE) -= do_now_bytes; + *(packed) += do_now_bytes; + return (do_now == left_in_block); +} + +/** + * Pack entire blocks, plus a possible remainder if SPACE is constrained to less than COUNT elements. + */ static inline void pack_predefined_data( opal_convertor_t* CONVERTOR, const dt_elem_desc_t* ELEM, @@ -36,27 +93,24 @@ pack_predefined_data( opal_convertor_t* CONVERTOR, { const ddt_elem_desc_t* _elem = &((ELEM)->elem); size_t blocklen_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size; - size_t cando_count = *(COUNT), do_now, do_now_bytes; + size_t cando_count = *(COUNT), do_now_bytes; unsigned char* _memory = (*memory) + _elem->disp; unsigned char* _packed = *packed; + assert( 0 == (cando_count % _elem->blocklen) ); /* no partials here */ assert( *(COUNT) <= _elem->count * _elem->blocklen); if( (blocklen_bytes * cando_count) > *(SPACE) ) cando_count = (*SPACE) / blocklen_bytes; - do_now = *(COUNT); /* save the COUNT for later */ /* premptively update the number of COUNT we will return. */ *(COUNT) -= cando_count; - if( 1 == _elem->count ) { /* Everything is contiguous, handle it as a prologue */ - goto do_epilog; - } if( 1 == _elem->blocklen ) { /* Do as many full blocklen as possible */ for(; cando_count > 0; cando_count--) { OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n", + DO_DEBUG( opal_output( 0, "pack memcpy( %p, %p, %lu ) => space %lu [blen = 1]\n", (void*)_packed, (void*)_memory, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); MEMCPY_CSUM( _packed, _memory, blocklen_bytes, (CONVERTOR) ); _packed += blocklen_bytes; @@ -65,61 +119,32 @@ pack_predefined_data( opal_convertor_t* CONVERTOR, goto update_and_return; } - blocklen_bytes *= _elem->blocklen; - if( (_elem->count * _elem->blocklen) == cando_count ) { - goto skip_prolog; - } - /** - * First check if we already did something on this element ? The COUNT is the number - * of remaining predefined types in the current elem, not how many predefined types - * should be manipulated in the current call (this number is instead reflected on the - * SPACE). - */ - do_now = do_now % _elem->blocklen; /* any partial elements ? */ + if( (1 < _elem->count) && (_elem->blocklen <= cando_count) ) { + blocklen_bytes *= _elem->blocklen; - if( 0 != do_now ) { - size_t left_in_block = do_now; /* left in the current blocklen */ - do_now = (do_now > cando_count ) ? cando_count : do_now; - do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; - - OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "pack 1. memcpy( %p, %p, %lu ) => space %lu [prolog]\n", - _packed, (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); - MEMCPY_CSUM( _packed, _memory, do_now_bytes, (CONVERTOR) ); - _memory += (ptrdiff_t)do_now_bytes; - /* compensate if we just completed a blocklen */ - if( do_now == left_in_block ) - _memory += _elem->extent - blocklen_bytes; - _packed += do_now_bytes; - cando_count -= do_now; - } - - skip_prolog: - /* Do as many full blocklen as possible */ - for(size_t _i = 0; _elem->blocklen <= cando_count; _i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n", - (void*)_packed, (void*)_memory, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); - MEMCPY_CSUM( _packed, _memory, blocklen_bytes, (CONVERTOR) ); - _packed += blocklen_bytes; - _memory += _elem->extent; - cando_count -= _elem->blocklen; + do { /* Do as many full blocklen as possible */ + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n", + (void*)_packed, (void*)_memory, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); + MEMCPY_CSUM( _packed, _memory, blocklen_bytes, (CONVERTOR) ); + _packed += blocklen_bytes; + _memory += _elem->extent; + cando_count -= _elem->blocklen; + } while (_elem->blocklen <= cando_count); } /** * As an epilog do anything left from the last blocklen. */ if( 0 != cando_count ) { - - do_epilog: - assert( cando_count < _elem->blocklen ); + assert( (cando_count < _elem->blocklen) || + ((1 == _elem->count) && (cando_count <= _elem->blocklen)) ); do_now_bytes = cando_count * opal_datatype_basicDatatypes[_elem->common.type]->size; OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, (CONVERTOR)->pDesc, (CONVERTOR)->count ); DO_DEBUG( opal_output( 0, "pack 3. memcpy( %p, %p, %lu ) => space %lu [epilog]\n", - (void*)_packed, (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); + (void*)_packed, (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); MEMCPY_CSUM( _packed, _memory, do_now_bytes, (CONVERTOR) ); _memory += do_now_bytes; _packed += do_now_bytes; @@ -159,7 +184,15 @@ static inline void pack_contiguous_loop( opal_convertor_t* CONVERTOR, *(COUNT) -= _copy_loops; } -#define PACK_PREDEFINED_DATATYPE( CONVERTOR, /* the convertor */ \ +#define PACK_PARTIAL_BLOCKLEN( CONVERTOR, /* the convertor */ \ + ELEM, /* the basic element to be packed */ \ + COUNT, /* the number of elements */ \ + MEMORY, /* the source pointer (char*) */ \ + PACKED, /* the destination pointer (char*) */ \ + SPACE ) /* the space in the destination buffer */ \ +pack_partial_blocklen( (CONVERTOR), (ELEM), &(COUNT), &(MEMORY), &(PACKED), &(SPACE) ) + +#define PACK_PREDEFINED_DATATYPE( CONVERTOR, /* the convertor */ \ ELEM, /* the basic element to be packed */ \ COUNT, /* the number of elements */ \ MEMORY, /* the source pointer (char*) */ \ diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index ac35a03c267..dca07796d99 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -282,6 +282,7 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { iov_ptr = (unsigned char *) iov[iov_count].iov_base; iov_len_local = iov[iov_count].iov_len; + if( 0 != pConvertor->partial_length ) { size_t element_length = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; size_t missing_length = element_length - pConvertor->partial_length; @@ -302,34 +303,31 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, iov_len_local -= missing_length; pConvertor->partial_length = 0; /* nothing more inside */ } - while( 1 ) { - while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { - /* now here we have a basic datatype */ - UNPACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, - iov_ptr, conv_ptr, iov_len_local ); - if( 0 == count_desc ) { /* completed */ + if( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + if( (pElem->elem.count * pElem->elem.blocklen) != count_desc ) { + /* we have a partial (less than blocklen) basic datatype */ + int rc = UNPACK_PARTIAL_BLOCKLEN( pConvertor, pElem, count_desc, + iov_ptr, conv_ptr, iov_len_local ); + if( 0 == rc ) /* not done */ + goto complete_loop; + if( 0 == count_desc ) { conv_ptr = pConvertor->pBaseBuf + pStack->disp; pos_desc++; /* advance to the next data */ UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - continue; - } - assert( pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED ); - if( 0 != iov_len_local ) { - unsigned char* temp = conv_ptr; - /* We have some partial data here. Let's copy it into the convertor - * and keep it hot until the next round. - */ - assert( iov_len_local < opal_datatype_basicDatatypes[pElem->elem.common.type]->size ); - COMPUTE_CSUM( iov_ptr, iov_len_local, pConvertor ); - - opal_unpack_partial_datatype( pConvertor, pElem, - iov_ptr, 0, iov_len_local, - &temp ); - - pConvertor->partial_length = iov_len_local; - iov_len_local = 0; } - goto complete_loop; + } + } + + while( 1 ) { + while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + /* we have a basic datatype (working on full blocks) */ + UNPACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, + iov_ptr, conv_ptr, iov_len_local ); + if( 0 != count_desc ) /* completed? */ + goto complete_loop; + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + pos_desc++; /* advance to the next data */ + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); } if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ DO_DEBUG( opal_output( 0, "unpack end_loop count %" PRIsize_t " stack_pos %d pos_desc %d disp %ld space %lu\n", @@ -337,11 +335,9 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, pStack->disp, (unsigned long)iov_len_local ); ); if( --(pStack->count) == 0 ) { /* end of loop */ if( 0 == pConvertor->stack_pos ) { - /* Do the same thing as when the loop is completed */ - iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ - total_unpacked += iov[iov_count].iov_len; - iov_count++; /* go to the next */ - goto complete_conversion; + /* we're done. Force the exit of the main for loop (around iovec) */ + *out_size = iov_count; + goto complete_loop; } pConvertor->stack_pos--; pStack--; @@ -380,14 +376,29 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, conv_ptr = pConvertor->pBaseBuf + pStack->disp; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" ); - continue; } } complete_loop: + assert( pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED ); + if( 0 != iov_len_local ) { + unsigned char* temp = conv_ptr; + /* We have some partial data here. Let's copy it into the convertor + * and keep it hot until the next round. + */ + assert( iov_len_local < opal_datatype_basicDatatypes[pElem->elem.common.type]->size ); + COMPUTE_CSUM( iov_ptr, iov_len_local, pConvertor ); + + opal_unpack_partial_datatype( pConvertor, pElem, + iov_ptr, 0, iov_len_local, + &temp ); + + pConvertor->partial_length = iov_len_local; + iov_len_local = 0; + } + iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ total_unpacked += iov[iov_count].iov_len; } - complete_conversion: *max_data = total_unpacked; pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ *out_size = iov_count; @@ -514,11 +525,9 @@ opal_unpack_general_function( opal_convertor_t* pConvertor, pStack->disp, (unsigned long)iov_len_local ); ); if( --(pStack->count) == 0 ) { /* end of loop */ if( 0 == pConvertor->stack_pos ) { - /* Do the same thing as when the loop is completed */ - iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ - total_unpacked += iov[iov_count].iov_len; - iov_count++; /* go to the next */ - goto complete_conversion; + /* we're done. Force the exit of the main for loop (around iovec) */ + *out_size = iov_count; + goto complete_loop; } pConvertor->stack_pos--; pStack--; @@ -552,7 +561,6 @@ opal_unpack_general_function( opal_convertor_t* pConvertor, iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ total_unpacked += iov[iov_count].iov_len; } - complete_conversion: *max_data = total_unpacked; pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ *out_size = iov_count; diff --git a/opal/datatype/opal_datatype_unpack.h b/opal/datatype/opal_datatype_unpack.h index 49a418ba2b3..db5b58fd3c3 100644 --- a/opal/datatype/opal_datatype_unpack.h +++ b/opal/datatype/opal_datatype_unpack.h @@ -26,6 +26,60 @@ CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) ) #endif +/** + * This function deals only with partial elements. The COUNT points however to the whole leftover count, + * but this function is only expected to operate on an amount less than blength, that would allow the rest + * of the pack process to handle only entire blength blocks (plus the left over). + * + * Return 1 if we are now aligned on a block, 0 otherwise. + */ +static inline int +unpack_partial_blocklen( opal_convertor_t* CONVERTOR, + const dt_elem_desc_t* ELEM, + size_t* COUNT, + unsigned char** packed, + unsigned char** memory, + size_t* SPACE ) +{ + const ddt_elem_desc_t* _elem = &((ELEM)->elem); + size_t do_now_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size; + size_t do_now = (*COUNT); + unsigned char* _memory = (*memory) + _elem->disp; + unsigned char* _packed = *packed; + + assert( *(COUNT) <= (_elem->count * _elem->blocklen)); + + /** + * First check if we already did something on this element ? The COUNT is the number + * of remaining predefined types in the current elem, not how many predefined types + * should be manipulated in the current call (this number is instead reflected on the + * SPACE). + */ + if( 0 == (do_now = (*COUNT) % _elem->blocklen) ) + return 1; + + size_t left_in_block = do_now; /* left in the current blocklen */ + + if( (do_now_bytes * do_now) > *(SPACE) ) + do_now = (*SPACE) / do_now_bytes; + + do_now_bytes *= do_now; + + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "unpack memcpy( %p, %p, %lu ) => space %lu [prolog]\n", + (void*)_memory, (void*)_packed, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); + MEMCPY_CSUM( _memory, _packed, do_now_bytes, (CONVERTOR) ); + *(memory) += (ptrdiff_t)do_now_bytes; + if( do_now == left_in_block ) /* compensate if completed a blocklen */ + *(memory) += _elem->extent - (_elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size); + + *(COUNT) -= do_now; + *(SPACE) -= do_now_bytes; + *(packed) += do_now_bytes; + return (do_now == left_in_block); +} + static inline void unpack_predefined_data( opal_convertor_t* CONVERTOR, const dt_elem_desc_t* ELEM, @@ -36,27 +90,24 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR, { const ddt_elem_desc_t* _elem = &((ELEM)->elem); size_t blocklen_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size; - size_t cando_count = (*COUNT), do_now, do_now_bytes; + size_t cando_count = (*COUNT), do_now_bytes; unsigned char* _memory = (*memory) + _elem->disp; unsigned char* _packed = *packed; + assert( 0 == (cando_count % _elem->blocklen) ); /* no partials here */ assert( *(COUNT) <= (_elem->count * _elem->blocklen)); if( (blocklen_bytes * cando_count) > *(SPACE) ) cando_count = (*SPACE) / blocklen_bytes; - do_now = *(COUNT); /* save the COUNT for later */ /* premptively update the number of COUNT we will return. */ *(COUNT) -= cando_count; - - if( 1 == _elem->count ) { /* Everything is contiguous, handle it as a prologue */ - goto do_epilog; - } + if( 1 == _elem->blocklen ) { /* Do as many full blocklen as possible */ for(; cando_count > 0; cando_count--) { OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "unpack 2. memcpy( %p, %p, %lu ) => space %lu\n", + DO_DEBUG( opal_output( 0, "unpack memcpy( %p, %p, %lu ) => space %lu [blen = 1]\n", (void*)_memory, (void*)_packed, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); MEMCPY_CSUM( _memory, _packed, blocklen_bytes, (CONVERTOR) ); _packed += blocklen_bytes; @@ -65,57 +116,27 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR, goto update_and_return; } - blocklen_bytes *= _elem->blocklen; - if( (_elem->count * _elem->blocklen) == cando_count ) { - goto skip_prolog; - } - - /** - * First check if we already did something on this element ? The COUNT is the number - * of remaining predefined types in the current elem, not how many predefined types - * should be manipulated in the current call (this number is instead reflected on the - * SPACE). - */ - do_now = do_now % _elem->blocklen; /* any partial elements ? */ - - if( 0 != do_now ) { - size_t left_in_block = do_now; /* left in the current blocklen */ - do_now = (do_now > cando_count ) ? cando_count : do_now; - do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; - - OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "unpack 1. memcpy( %p, %p, %lu ) => space %lu [prolog]\n", - (void*)_memory, (void*)_packed, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); - MEMCPY_CSUM( _memory, _packed, do_now_bytes, (CONVERTOR) ); - _memory += (ptrdiff_t)do_now_bytes; - /* compensate if we just completed a blocklen */ - if( do_now == left_in_block ) - _memory += _elem->extent - blocklen_bytes; - _packed += do_now_bytes; - cando_count -= do_now; - } + if( (1 < _elem->count) && (_elem->blocklen <= cando_count) ) { + blocklen_bytes *= _elem->blocklen; - skip_prolog: - /* Do as many full blocklen as possible */ - for(size_t _i = 0; _elem->blocklen <= cando_count; _i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "unpack 2. memcpy( %p, %p, %lu ) => space %lu\n", - (void*)_memory, (void*)_packed, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); - MEMCPY_CSUM( _memory, _packed, blocklen_bytes, (CONVERTOR) ); - _packed += blocklen_bytes; - _memory += _elem->extent; - cando_count -= _elem->blocklen; + do { /* Do as many full blocklen as possible */ + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "unpack 2. memcpy( %p, %p, %lu ) => space %lu\n", + (void*)_memory, (void*)_packed, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); + MEMCPY_CSUM( _memory, _packed, blocklen_bytes, (CONVERTOR) ); + _packed += blocklen_bytes; + _memory += _elem->extent; + cando_count -= _elem->blocklen; + } while (_elem->blocklen <= cando_count); } /** * As an epilog do anything left from the last blocklen. */ if( 0 != cando_count ) { - - do_epilog: - assert( cando_count < _elem->blocklen ); + assert( (cando_count < _elem->blocklen) || + ((1 == _elem->count) && (cando_count <= _elem->blocklen)) ); do_now_bytes = cando_count * opal_datatype_basicDatatypes[_elem->common.type]->size; OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, (CONVERTOR)->pDesc, (CONVERTOR)->count ); @@ -160,8 +181,21 @@ static inline void unpack_contiguous_loop( opal_convertor_t* CONVERTOR, *(COUNT) -= _copy_loops; } -#define UNPACK_PREDEFINED_DATATYPE( CONVERTOR, ELEM, COUNT, PACKED, MEMORY, SPACE ) \ - unpack_predefined_data( (CONVERTOR), (ELEM), &(COUNT), &(PACKED), &(MEMORY), &(SPACE) ) +#define UNPACK_PARTIAL_BLOCKLEN( CONVERTOR, /* the convertor */ \ + ELEM, /* the basic element to be packed */ \ + COUNT, /* the number of elements */ \ + PACKED, /* the destination pointer (char*) */ \ + MEMORY, /* the source pointer (char*) */ \ + SPACE ) /* the space in the destination buffer */ \ +unpack_partial_blocklen( (CONVERTOR), (ELEM), &(COUNT), &(PACKED), &(MEMORY), &(SPACE) ) + +#define UNPACK_PREDEFINED_DATATYPE( CONVERTOR, /* the convertor */ \ + ELEM, /* the basic element to be packed */ \ + COUNT, /* the number of elements */ \ + PACKED, /* the destination pointer (char*) */ \ + MEMORY, /* the source pointer (char*) */ \ + SPACE ) /* the space in the destination buffer */ \ +unpack_predefined_data( (CONVERTOR), (ELEM), &(COUNT), &(PACKED), &(MEMORY), &(SPACE) ) #define UNPACK_CONTIGUOUS_LOOP( CONVERTOR, ELEM, COUNT, PACKED, MEMORY, SPACE ) \ unpack_contiguous_loop( (CONVERTOR), (ELEM), &(COUNT), &(PACKED), &(MEMORY), &(SPACE) ) From 83d40c1e14c370bea685314988371856c9ad57ae Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 14 Aug 2019 01:05:28 -0400 Subject: [PATCH 12/14] Fix the stack displacement. Fixes the convertor iovec description on the MPI-IO reported by Edgar. Signed-off-by: George Bosilca --- opal/datatype/opal_convertor_raw.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/opal/datatype/opal_convertor_raw.c b/opal/datatype/opal_convertor_raw.c index 893792583f9..3d22cd792a3 100644 --- a/opal/datatype/opal_convertor_raw.c +++ b/opal/datatype/opal_convertor_raw.c @@ -126,8 +126,8 @@ opal_convertor_raw( opal_convertor_t* pConvertor, const ddt_elem_desc_t* current = &(pElem->elem); if( count_desc != (current->count * current->blocklen) ) { /* Not the full element description */ - do_now = current->blocklen - (count_desc % current->blocklen); /* how much left in the block */ - if( do_now ) { + if( (do_now = count_desc % current->blocklen) ) { + do_now = current->blocklen - do_now; /* how much left in the block */ source_base += current->disp; blength = do_now * opal_datatype_basicDatatypes[current->common.type]->size; OPAL_DATATYPE_SAFEGUARD_POINTER( source_base, blength, pConvertor->pBaseBuf, @@ -136,12 +136,12 @@ opal_convertor_raw( opal_convertor_t* pConvertor, index, (void*)source_base, blength ); ); opal_convertor_merge_iov( iov, iov_count, (IOVBASE_TYPE *) source_base, blength, &index ); - /* not check the return value, we know there was at least one element in the iovec */ + /* ignore the return value, we know there was at least one element in the iovec */ sum_iov_len += blength; count_desc -= do_now; - source_base += (current->extent - current->disp + - (current->blocklen - do_now) * opal_datatype_basicDatatypes[current->common.type]->size); + source_base += (blength - current->blocklen * opal_datatype_basicDatatypes[current->common.type]->size + + current->extent - current->disp); } } } @@ -258,7 +258,7 @@ opal_convertor_raw( opal_convertor_t* pConvertor, } /* I complete an element, next step I should go to the next one */ PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_UINT1, count_desc, - source_base - pStack->disp - pConvertor->pBaseBuf ); + source_base - pConvertor->pBaseBuf ); DO_DEBUG( opal_output( 0, "raw save stack stack_pos %d pos_desc %d count_desc %" PRIsize_t " disp %ld\n", pConvertor->stack_pos, pStack->index, pStack->count, (long)pStack->disp ); ); return 0; From 8e6e826b54725ded4f15f9677ff0a11f80ee3e6c Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 14 Aug 2019 10:59:50 -0400 Subject: [PATCH 13/14] Fix the variable names used for the datatype dump. Signed-off-by: George Bosilca --- opal/datatype/opal_convertor_internal.h | 5 --- opal/datatype/opal_convertor_raw.c | 2 +- opal/datatype/opal_datatype_copy.c | 2 +- opal/datatype/opal_datatype_internal.h | 9 +++-- opal/datatype/opal_datatype_module.c | 51 +++++++++++++++---------- opal/datatype/opal_datatype_pack.c | 4 +- opal/datatype/opal_datatype_position.c | 2 +- opal/datatype/opal_datatype_unpack.c | 2 +- 8 files changed, 41 insertions(+), 36 deletions(-) diff --git a/opal/datatype/opal_convertor_internal.h b/opal/datatype/opal_convertor_internal.h index 025633cb7e7..39690f5bd19 100644 --- a/opal/datatype/opal_convertor_internal.h +++ b/opal/datatype/opal_convertor_internal.h @@ -50,11 +50,6 @@ opal_convertor_master_t* opal_convertor_find_or_create_master( uint32_t remote_a void opal_convertor_destroy_masters( void ); -#if OPAL_ENABLE_DEBUG -extern bool opal_pack_debug; -extern bool opal_unpack_debug; -#endif /* OPAL_ENABLE_DEBUG */ - END_C_DECLS #endif /* OPAL_CONVERTOR_INTERNAL_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/opal_convertor_raw.c b/opal/datatype/opal_convertor_raw.c index 3d22cd792a3..5bea5dcf5b8 100644 --- a/opal/datatype/opal_convertor_raw.c +++ b/opal/datatype/opal_convertor_raw.c @@ -25,7 +25,7 @@ #if OPAL_ENABLE_DEBUG #include "opal/util/output.h" -#define DO_DEBUG(INST) if( opal_pack_debug ) { INST } +#define DO_DEBUG(INST) if( opal_ddt_raw_debug ) { INST } #else #define DO_DEBUG(INST) #endif /* OPAL_ENABLE_DEBUG */ diff --git a/opal/datatype/opal_datatype_copy.c b/opal/datatype/opal_datatype_copy.c index 7bf94ef97b9..c70bdd24dfa 100644 --- a/opal/datatype/opal_datatype_copy.c +++ b/opal/datatype/opal_datatype_copy.c @@ -36,7 +36,7 @@ #if OPAL_ENABLE_DEBUG -#define DO_DEBUG(INST) if( opal_copy_debug ) { INST } +#define DO_DEBUG(INST) if( opal_ddt_copy_debug ) { INST } #else #define DO_DEBUG(INST) #endif /* OPAL_ENABLE_DEBUG */ diff --git a/opal/datatype/opal_datatype_internal.h b/opal/datatype/opal_datatype_internal.h index 1f10c9138aa..bdeb0cc429e 100644 --- a/opal/datatype/opal_datatype_internal.h +++ b/opal/datatype/opal_datatype_internal.h @@ -496,10 +496,11 @@ OPAL_DECLSPEC int opal_datatype_contain_basic_datatypes( const struct opal_datat OPAL_DECLSPEC int opal_datatype_dump_data_flags( unsigned short usflags, char* ptr, size_t length ); OPAL_DECLSPEC int opal_datatype_dump_data_desc( union dt_elem_desc* pDesc, int nbElems, char* ptr, size_t length ); -extern bool opal_position_debug; -extern bool opal_copy_debug; -extern bool opal_unpack_debug; -extern bool opal_pack_debug; +extern bool opal_ddt_position_debug; +extern bool opal_ddt_copy_debug; +extern bool opal_ddt_unpack_debug; +extern bool opal_ddt_pack_debug; +extern bool opal_ddt_raw_debug; END_C_DECLS #endif /* OPAL_DATATYPE_INTERNAL_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/opal_datatype_module.c b/opal/datatype/opal_datatype_module.c index d4415b21ef1..ba933b5fe2b 100644 --- a/opal/datatype/opal_datatype_module.c +++ b/opal/datatype/opal_datatype_module.c @@ -37,10 +37,11 @@ /* by default the debuging is turned off */ int opal_datatype_dfd = -1; -bool opal_unpack_debug = false; -bool opal_pack_debug = false; -bool opal_position_debug = false; -bool opal_copy_debug = false; +bool opal_ddt_unpack_debug = false; +bool opal_ddt_pack_debug = false; +bool opal_ddt_position_debug = false; +bool opal_ddt_copy_debug = false; +bool opal_ddt_raw_debug = false; int opal_ddt_verbose = -1; /* Has the datatype verbose it's own output stream */ extern int opal_cuda_verbose; @@ -148,35 +149,43 @@ int opal_datatype_register_params(void) int ret; ret = mca_base_var_register ("opal", "mpi", NULL, "ddt_unpack_debug", - "Whether to output debugging information in the ddt unpack functions (nonzero = enabled)", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_LOCAL, &opal_unpack_debug); + "Whether to output debugging information in the ddt unpack functions (nonzero = enabled)", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, &opal_ddt_unpack_debug); if (0 > ret) { - return ret; + return ret; } ret = mca_base_var_register ("opal", "mpi", NULL, "ddt_pack_debug", - "Whether to output debugging information in the ddt pack functions (nonzero = enabled)", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_LOCAL, &opal_pack_debug); + "Whether to output debugging information in the ddt pack functions (nonzero = enabled)", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, &opal_ddt_pack_debug); if (0 > ret) { - return ret; + return ret; + } + + ret = mca_base_var_register ("opal", "mpi", NULL, "ddt_raw_debug", + "Whether to output debugging information in the ddt raw functions (nonzero = enabled)", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, &opal_ddt_raw_debug); + if (0 > ret) { + return ret; } ret = mca_base_var_register ("opal", "mpi", NULL, "ddt_position_debug", - "Non zero lead to output generated by the datatype position functions", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_LOCAL, &opal_position_debug); + "Non zero lead to output generated by the datatype position functions", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, &opal_ddt_position_debug); if (0 > ret) { - return ret; + return ret; } ret = mca_base_var_register ("opal", "mpi", NULL, "ddt_copy_debug", - "Whether to output debugging information in the ddt copy functions (nonzero = enabled)", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_LOCAL, &opal_copy_debug); + "Whether to output debugging information in the ddt copy functions (nonzero = enabled)", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, &opal_ddt_copy_debug); if (0 > ret) { - return ret; + return ret; } ret = mca_base_var_register ("opal", "opal", NULL, "ddt_verbose", @@ -195,7 +204,7 @@ int opal_datatype_register_params(void) OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_LOCAL, &opal_cuda_verbose); if (0 > ret) { - return ret; + return ret; } #endif diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index c0ab6df66d8..f21adcccb34 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -31,7 +31,7 @@ #if OPAL_ENABLE_DEBUG #include "opal/util/output.h" -#define DO_DEBUG(INST) if( opal_pack_debug ) { INST } +#define DO_DEBUG(INST) if( opal_ddt_pack_debug ) { INST } #else #define DO_DEBUG(INST) #endif /* OPAL_ENABLE_DEBUG */ @@ -272,7 +272,7 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { iov_ptr = (unsigned char *) iov[iov_count].iov_base; iov_len_local = iov[iov_count].iov_len; - + if( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { if( (pElem->elem.count * pElem->elem.blocklen) != count_desc ) { /* we have a partial (less than blocklen) basic datatype */ diff --git a/opal/datatype/opal_datatype_position.c b/opal/datatype/opal_datatype_position.c index 204d670a3ef..02ec55651a0 100644 --- a/opal/datatype/opal_datatype_position.c +++ b/opal/datatype/opal_datatype_position.c @@ -33,7 +33,7 @@ #if OPAL_ENABLE_DEBUG #include "opal/util/output.h" -#define DO_DEBUG(INST) if( opal_position_debug ) { INST } +#define DO_DEBUG(INST) if( opal_ddt_position_debug ) { INST } #else #define DO_DEBUG(INST) #endif /* OPAL_ENABLE_DEBUG */ diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index dca07796d99..0925bde736d 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -33,7 +33,7 @@ #if OPAL_ENABLE_DEBUG #include "opal/util/output.h" -#define DO_DEBUG(INST) if( opal_unpack_debug ) { INST } +#define DO_DEBUG(INST) if( opal_ddt_unpack_debug ) { INST } #else #define DO_DEBUG(INST) #endif /* OPAL_ENABLE_DEBUG */ From c9f48e2e77dbf4928b2d6c18f2576557c112cc1c Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 14 Aug 2019 11:06:47 -0400 Subject: [PATCH 14/14] Whitespace cleanup No code or logic changes. Signed-off-by: George Bosilca Signed-off-by: Jeff Squyres --- ompi/mca/common/monitoring/common_monitoring.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ompi/mca/common/monitoring/common_monitoring.c b/ompi/mca/common/monitoring/common_monitoring.c index e521ca56417..ff252bf944f 100644 --- a/ompi/mca/common/monitoring/common_monitoring.c +++ b/ompi/mca/common/monitoring/common_monitoring.c @@ -268,7 +268,7 @@ void mca_common_monitoring_register(void*pml_monitoring_component) &mca_common_monitoring_enabled); mca_common_monitoring_current_state = mca_common_monitoring_enabled; - + (void)mca_base_var_register("ompi", "pml", "monitoring", "enable_output", "Enable the PML monitoring textual output at MPI_Finalize " "(it will be automatically turned off when MPIT is used to " @@ -278,7 +278,7 @@ void mca_common_monitoring_register(void*pml_monitoring_component) MCA_BASE_VAR_FLAG_DWG, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &mca_common_monitoring_output_enabled); - + (void)mca_base_var_register("ompi", "pml", "monitoring", "filename", /*&mca_common_monitoring_component.pmlm_version, "filename",*/ "The name of the file where the monitoring information " @@ -292,7 +292,7 @@ void mca_common_monitoring_register(void*pml_monitoring_component) /* Now that the MCA variables are automatically unregistered when * their component close, we need to keep a safe copy of the - * filename. + * filename. * Keep the copy completely separated in order to let the initial * filename to be handled by the framework. It's easier to deal * with the string lifetime.