diff --git a/ompi/datatype/ompi_datatype.h b/ompi/datatype/ompi_datatype.h index 8b48bc30973..f589c874b64 100644 --- a/ompi/datatype/ompi_datatype.h +++ b/ompi/datatype/ompi_datatype.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2009-2013 The University of Tennessee and The University + * Copyright (c) 2009-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. diff --git a/ompi/datatype/ompi_datatype_create_contiguous.c b/ompi/datatype/ompi_datatype_create_contiguous.c index fb44673ef5c..6a287caa41c 100644 --- a/ompi/datatype/ompi_datatype_create_contiguous.c +++ b/ompi/datatype/ompi_datatype_create_contiguous.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -29,13 +29,12 @@ int32_t ompi_datatype_create_contiguous( int count, const ompi_datatype_t* oldTy { ompi_datatype_t* pdt; - if( 0 == count ) { - pdt = ompi_datatype_create( 0 ); - ompi_datatype_add( pdt, &ompi_mpi_datatype_null.dt, 0, 0, 0 ); - } else { - pdt = ompi_datatype_create( oldType->super.desc.used + 2 ); - opal_datatype_add( &(pdt->super), &(oldType->super), count, 0, (oldType->super.ub - oldType->super.lb) ); + if( (0 == count) || (0 == oldType->super.size) ) { + return ompi_datatype_duplicate( &ompi_mpi_datatype_null.dt, newType); } + + pdt = ompi_datatype_create( oldType->super.desc.used + 2 ); + opal_datatype_add( &(pdt->super), &(oldType->super), count, 0, (oldType->super.ub - oldType->super.lb) ); *newType = pdt; return OMPI_SUCCESS; } diff --git a/ompi/datatype/ompi_datatype_create_darray.c b/ompi/datatype/ompi_datatype_create_darray.c index a245dcebce4..e0292755c4b 100644 --- a/ompi/datatype/ompi_datatype_create_darray.c +++ b/ompi/datatype/ompi_datatype_create_darray.c @@ -192,9 +192,7 @@ int32_t ompi_datatype_create_darray(int size, if (ndims < 1) { /* Don't just return MPI_DATATYPE_NULL as that can't be MPI_TYPE_FREE()ed, and that seems bad */ - *newtype = ompi_datatype_create(0); - ompi_datatype_add(*newtype, &ompi_mpi_datatype_null.dt, 0, 0, 0); - return MPI_SUCCESS; + return ompi_datatype_duplicate( &ompi_mpi_datatype_null.dt, newtype); } rc = ompi_datatype_type_extent(oldtype, &orig_extent); diff --git a/ompi/datatype/ompi_datatype_create_indexed.c b/ompi/datatype/ompi_datatype_create_indexed.c index 457efb1e6ff..2684d9d7df0 100644 --- a/ompi/datatype/ompi_datatype_create_indexed.c +++ b/ompi/datatype/ompi_datatype_create_indexed.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -34,24 +34,28 @@ int32_t ompi_datatype_create_indexed( int count, const int* pBlockLength, const int* pDisp, const ompi_datatype_t* oldType, ompi_datatype_t** newType ) { - ompi_datatype_t* pdt; - int i; ptrdiff_t extent, disp, endat; + ompi_datatype_t* pdt; size_t dLength; + int i; - if( 0 == count ) { + /* ignore all cases that lead to an empty type */ + ompi_datatype_type_size(oldType, &dLength); + for( i = 0; (i < count) && (0 == pBlockLength[i]); i++ ); /* find first non zero */ + if( (i == count) || (0 == dLength) ) { return ompi_datatype_duplicate( &ompi_mpi_datatype_null.dt, newType); } - disp = pDisp[0]; - dLength = pBlockLength[0]; + disp = pDisp[i]; + dLength = pBlockLength[i]; endat = disp + dLength; ompi_datatype_type_extent( oldType, &extent ); - pdt = ompi_datatype_create( count * (2 + oldType->super.desc.used) ); - for( i = 1; i < count; i++ ) { - if( endat == pDisp[i] ) { - /* contiguous with the previsious */ + pdt = ompi_datatype_create( (count - i) * (2 + oldType->super.desc.used) ); + for( i += 1; i < count; i++ ) { + if( 0 == pBlockLength[i] ) /* ignore empty length */ + continue; + if( endat == pDisp[i] ) { /* contiguous with the previsious */ dLength += pBlockLength[i]; endat += pBlockLength[i]; } else { @@ -71,26 +75,28 @@ int32_t ompi_datatype_create_indexed( int count, const int* pBlockLength, const int32_t ompi_datatype_create_hindexed( int count, const int* pBlockLength, const ptrdiff_t* pDisp, const ompi_datatype_t* oldType, ompi_datatype_t** newType ) { - ompi_datatype_t* pdt; - int i; ptrdiff_t extent, disp, endat; + ompi_datatype_t* pdt; size_t dLength; + int i; - if( 0 == count ) { - *newType = ompi_datatype_create( 0 ); - ompi_datatype_add( *newType, &ompi_mpi_datatype_null.dt, 0, 0, 0); - return OMPI_SUCCESS; + /* ignore all cases that lead to an empty type */ + ompi_datatype_type_size(oldType, &dLength); + for( i = 0; (i < count) && (0 == pBlockLength[i]); i++ ); /* find first non zero */ + if( (i == count) || (0 == dLength) ) { + return ompi_datatype_duplicate( &ompi_mpi_datatype_null.dt, newType); } ompi_datatype_type_extent( oldType, &extent ); - pdt = ompi_datatype_create( count * (2 + oldType->super.desc.used) ); - disp = pDisp[0]; - dLength = pBlockLength[0]; + disp = pDisp[i]; + dLength = pBlockLength[i]; endat = disp + dLength * extent; - for( i = 1; i < count; i++ ) { - if( endat == pDisp[i] ) { - /* contiguous with the previsious */ + pdt = ompi_datatype_create( (count - i) * (2 + oldType->super.desc.used) ); + for( i += 1; i < count; i++ ) { + if( 0 == pBlockLength[i] ) /* ignore empty length */ + continue; + if( endat == pDisp[i] ) { /* contiguous with the previsious */ dLength += pBlockLength[i]; endat += pBlockLength[i] * extent; } else { @@ -110,21 +116,15 @@ int32_t ompi_datatype_create_hindexed( int count, const int* pBlockLength, const int32_t ompi_datatype_create_indexed_block( int count, int bLength, const int* pDisp, const ompi_datatype_t* oldType, ompi_datatype_t** newType ) { - ompi_datatype_t* pdt; - int i; ptrdiff_t extent, disp, endat; + ompi_datatype_t* pdt; size_t dLength; + int i; - ompi_datatype_type_extent( oldType, &extent ); if( (count == 0) || (bLength == 0) ) { - if( 0 == count ) { - return ompi_datatype_duplicate(&ompi_mpi_datatype_null.dt, newType); - } else { - *newType = ompi_datatype_create(1); - ompi_datatype_add( *newType, oldType, 0, pDisp[0] * extent, extent ); - return OMPI_SUCCESS; - } + return ompi_datatype_duplicate(&ompi_mpi_datatype_null.dt, newType); } + ompi_datatype_type_extent( oldType, &extent ); pdt = ompi_datatype_create( count * (2 + oldType->super.desc.used) ); disp = pDisp[0]; dLength = bLength; @@ -150,34 +150,29 @@ int32_t ompi_datatype_create_indexed_block( int count, int bLength, const int* p int32_t ompi_datatype_create_hindexed_block( int count, int bLength, const ptrdiff_t* pDisp, const ompi_datatype_t* oldType, ompi_datatype_t** newType ) { - ompi_datatype_t* pdt; - int i; ptrdiff_t extent, disp, endat; + ompi_datatype_t* pdt; size_t dLength; + int i; - ompi_datatype_type_extent( oldType, &extent ); if( (count == 0) || (bLength == 0) ) { - *newType = ompi_datatype_create(1); - if( 0 == count ) - ompi_datatype_add( *newType, &ompi_mpi_datatype_null.dt, 0, 0, 0 ); - else - ompi_datatype_add( *newType, oldType, 0, pDisp[0] * extent, extent ); - return OMPI_SUCCESS; + return ompi_datatype_duplicate(&ompi_mpi_datatype_null.dt, newType); } + ompi_datatype_type_extent( oldType, &extent ); pdt = ompi_datatype_create( count * (2 + oldType->super.desc.used) ); disp = pDisp[0]; dLength = bLength; - endat = disp + dLength; + endat = disp + dLength * extent; for( i = 1; i < count; i++ ) { if( endat == pDisp[i] ) { /* contiguous with the previsious */ dLength += bLength; - endat += bLength; + endat += bLength * extent; } else { ompi_datatype_add( pdt, oldType, dLength, disp, extent ); disp = pDisp[i]; dLength = bLength; - endat = disp + bLength; + endat = disp + bLength * extent; } } ompi_datatype_add( pdt, oldType, dLength, disp, extent ); diff --git a/ompi/datatype/ompi_datatype_create_struct.c b/ompi/datatype/ompi_datatype_create_struct.c index 98daa8bacbb..9c78f53fee3 100644 --- a/ompi/datatype/ompi_datatype_create_struct.c +++ b/ompi/datatype/ompi_datatype_create_struct.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -31,27 +31,27 @@ int32_t ompi_datatype_create_struct( int count, const int* pBlockLength, const ptrdiff_t* pDisp, ompi_datatype_t* const * pTypes, ompi_datatype_t** newType ) { - int i; ptrdiff_t disp = 0, endto, lastExtent, lastDisp; - int lastBlock; ompi_datatype_t *pdt, *lastType; + int lastBlock; + int i, start_from; - if( 0 == count ) { - *newType = ompi_datatype_create( 0 ); - ompi_datatype_add( *newType, &ompi_mpi_datatype_null.dt, 0, 0, 0); - return OMPI_SUCCESS; + /* Find first non-zero length element */ + for( i = 0; (i < count) && (0 == pBlockLength[i]); i++ ); + if( i == count ) { /* either nothing or nothing relevant */ + return ompi_datatype_duplicate( &ompi_mpi_datatype_null.dt, newType); } - - /* if we compute the total number of elements before we can + /* compute the total number of elements before we can * avoid increasing the size of the desc array often. */ - lastType = (ompi_datatype_t*)pTypes[0]; - lastBlock = pBlockLength[0]; + start_from = i; + lastType = (ompi_datatype_t*)pTypes[start_from]; + lastBlock = pBlockLength[start_from]; lastExtent = lastType->super.ub - lastType->super.lb; - lastDisp = pDisp[0]; - endto = pDisp[0] + lastExtent * lastBlock; + lastDisp = pDisp[start_from]; + endto = pDisp[start_from] + lastExtent * lastBlock; - for( i = 1; i < count; i++ ) { + for( i = (start_from + 1); i < count; i++ ) { if( (pTypes[i] == lastType) && (pDisp[i] == endto) ) { lastBlock += pBlockLength[i]; endto = lastDisp + lastBlock * lastExtent; @@ -68,16 +68,16 @@ int32_t ompi_datatype_create_struct( int count, const int* pBlockLength, const p disp += lastType->super.desc.used; if( lastBlock != 1 ) disp += 2; - lastType = (ompi_datatype_t*)pTypes[0]; - lastBlock = pBlockLength[0]; + lastType = (ompi_datatype_t*)pTypes[start_from]; + lastBlock = pBlockLength[start_from]; lastExtent = lastType->super.ub - lastType->super.lb; - lastDisp = pDisp[0]; - endto = pDisp[0] + lastExtent * lastBlock; + lastDisp = pDisp[start_from]; + endto = pDisp[start_from] + lastExtent * lastBlock; pdt = ompi_datatype_create( (int32_t)disp ); /* Do again the same loop but now add the elements */ - for( i = 1; i < count; i++ ) { + for( i = (start_from + 1); i < count; i++ ) { if( (pTypes[i] == lastType) && (pDisp[i] == endto) ) { lastBlock += pBlockLength[i]; endto = lastDisp + lastBlock * lastExtent; diff --git a/ompi/datatype/ompi_datatype_create_vector.c b/ompi/datatype/ompi_datatype_create_vector.c index 1de8df4d2d2..c4829a4b54c 100644 --- a/ompi/datatype/ompi_datatype_create_vector.c +++ b/ompi/datatype/ompi_datatype_create_vector.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -28,23 +28,14 @@ #include "ompi/datatype/ompi_datatype.h" -/* Open questions ... - * - how to improuve the handling of these vectors (creating a temporary datatype - * can be ONLY a initial solution. - * - */ - int32_t ompi_datatype_create_vector( int count, int bLength, int stride, const ompi_datatype_t* oldType, ompi_datatype_t** newType ) { ompi_datatype_t *pTempData, *pData; ptrdiff_t extent = oldType->super.ub - oldType->super.lb; - - if( 0 == count ) { - *newType = ompi_datatype_create( 0 ); - ompi_datatype_add( *newType, &ompi_mpi_datatype_null.dt, 0, 0, 0); - return OMPI_SUCCESS; + if( (0 == count) || (0 == bLength) ) { + return ompi_datatype_duplicate( &ompi_mpi_datatype_null.dt, newType); } pData = ompi_datatype_create( oldType->super.desc.used + 2 ); @@ -72,10 +63,8 @@ int32_t ompi_datatype_create_hvector( int count, int bLength, ptrdiff_t stride, ompi_datatype_t *pTempData, *pData; ptrdiff_t extent = oldType->super.ub - oldType->super.lb; - if( 0 == count ) { - *newType = ompi_datatype_create( 0 ); - ompi_datatype_add( *newType, &ompi_mpi_datatype_null.dt, 0, 0, 0); - return OMPI_SUCCESS; + if( (0 == count) || (0 == bLength) ) { + return ompi_datatype_duplicate( &ompi_mpi_datatype_null.dt, newType); } pTempData = ompi_datatype_create( oldType->super.desc.used + 2 ); diff --git a/ompi/datatype/ompi_datatype_external.c b/ompi/datatype/ompi_datatype_external.c index d47531ef29e..53b907218cf 100644 --- a/ompi/datatype/ompi_datatype_external.c +++ b/ompi/datatype/ompi_datatype_external.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2016 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, @@ -26,7 +26,6 @@ #include #include "ompi/runtime/params.h" -#include "ompi/communicator/communicator.h" #include "ompi/datatype/ompi_datatype.h" #include "opal/datatype/opal_convertor.h" diff --git a/ompi/datatype/ompi_datatype_module.c b/ompi/datatype/ompi_datatype_module.c index 3ee09173cd8..33e8d9b9e92 100644 --- a/ompi/datatype/ompi_datatype_module.c +++ b/ompi/datatype/ompi_datatype_module.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -736,14 +736,14 @@ void ompi_datatype_dump( const ompi_datatype_t* pData ) length = length * 100 + 500; buffer = (char*)malloc( length ); index += snprintf( buffer, length - index, - "Datatype %p[%s] id %d size %ld align %d opal_id %d length %d used %d\n" - "true_lb %ld true_ub %ld (true_extent %ld) lb %ld ub %ld (extent %ld)\n" - "nbElems %d loops %d flags %X (", - (void*)pData, pData->name, pData->id, - (long)pData->super.size, (int)pData->super.align, pData->super.id, (int)pData->super.desc.length, (int)pData->super.desc.used, - (long)pData->super.true_lb, (long)pData->super.true_ub, (long)(pData->super.true_ub - pData->super.true_lb), - (long)pData->super.lb, (long)pData->super.ub, (long)(pData->super.ub - pData->super.lb), - (int)pData->super.nbElems, (int)pData->super.loops, (int)pData->super.flags ); + "Datatype %p[%s] id %d size %" PRIsize_t " align %u opal_id %u length %" PRIsize_t " used %" PRIsize_t "\n" + "true_lb %td true_ub %td (true_extent %td) lb %td ub %td (extent %td)\n" + "nbElems %" PRIsize_t " loops %u flags %X (", + (void*)pData, pData->name, pData->id, + pData->super.size, pData->super.align, (uint32_t)pData->super.id, pData->super.desc.length, pData->super.desc.used, + pData->super.true_lb, pData->super.true_ub, pData->super.true_ub - pData->super.true_lb, + pData->super.lb, pData->super.ub, pData->super.ub - pData->super.lb, + pData->super.nbElems, pData->super.loops, (int)pData->super.flags ); /* dump the flags */ if( ompi_datatype_is_predefined(pData) ) { index += snprintf( buffer + index, length - index, "predefined " ); diff --git a/ompi/mca/common/monitoring/common_monitoring.c b/ompi/mca/common/monitoring/common_monitoring.c index e521ca56417..ff252bf944f 100644 --- a/ompi/mca/common/monitoring/common_monitoring.c +++ b/ompi/mca/common/monitoring/common_monitoring.c @@ -268,7 +268,7 @@ void mca_common_monitoring_register(void*pml_monitoring_component) &mca_common_monitoring_enabled); mca_common_monitoring_current_state = mca_common_monitoring_enabled; - + (void)mca_base_var_register("ompi", "pml", "monitoring", "enable_output", "Enable the PML monitoring textual output at MPI_Finalize " "(it will be automatically turned off when MPIT is used to " @@ -278,7 +278,7 @@ void mca_common_monitoring_register(void*pml_monitoring_component) MCA_BASE_VAR_FLAG_DWG, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &mca_common_monitoring_output_enabled); - + (void)mca_base_var_register("ompi", "pml", "monitoring", "filename", /*&mca_common_monitoring_component.pmlm_version, "filename",*/ "The name of the file where the monitoring information " @@ -292,7 +292,7 @@ void mca_common_monitoring_register(void*pml_monitoring_component) /* Now that the MCA variables are automatically unregistered when * their component close, we need to keep a safe copy of the - * filename. + * filename. * Keep the copy completely separated in order to let the initial * filename to be handled by the framework. It's easier to deal * with the string lifetime. diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c index ce889f7e959..4754723f68a 100644 --- a/opal/datatype/opal_convertor.c +++ b/opal/datatype/opal_convertor.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -324,8 +324,9 @@ int32_t opal_convertor_unpack( opal_convertor_t* pConv, return pConv->fAdvance( pConv, iov, out_size, max_data ); } -static inline int opal_convertor_create_stack_with_pos_contig( opal_convertor_t* pConvertor, - size_t starting_point, const size_t* sizes ) +static inline int +opal_convertor_create_stack_with_pos_contig( opal_convertor_t* pConvertor, + size_t starting_point, const size_t* sizes ) { dt_stack_t* pStack; /* pointer to the position on the stack */ const opal_datatype_t* pData = pConvertor->pDesc; @@ -349,14 +350,14 @@ static inline int opal_convertor_create_stack_with_pos_contig( opal_convertor_t* pStack[0].disp = count * extent; /* now compute the number of pending bytes */ - count = starting_point - count * pData->size; + count = starting_point % pData->size; /** * We save the current displacement starting from the begining * of this data. */ if( OPAL_LIKELY(0 == count) ) { pStack[1].type = pElems->elem.common.type; - pStack[1].count = pElems->elem.count; + pStack[1].count = pElems->elem.blocklen; } else { pStack[1].type = OPAL_DATATYPE_UINT1; pStack[1].count = pData->size - count; @@ -370,9 +371,9 @@ static inline int opal_convertor_create_stack_with_pos_contig( opal_convertor_t* return OPAL_SUCCESS; } -static inline -int opal_convertor_create_stack_at_begining( opal_convertor_t* convertor, - const size_t* sizes ) +static inline int +opal_convertor_create_stack_at_begining( opal_convertor_t* convertor, + const size_t* sizes ) { dt_stack_t* pStack = convertor->pStack; dt_elem_desc_t* pElems; @@ -402,7 +403,7 @@ int opal_convertor_create_stack_at_begining( opal_convertor_t* convertor, pStack[1].count = pElems[0].loop.loops; pStack[1].type = OPAL_DATATYPE_LOOP; } else { - pStack[1].count = pElems[0].elem.count; + pStack[1].count = pElems[0].elem.count * pElems[0].elem.blocklen; pStack[1].type = pElems[0].elem.common.type; } return OPAL_SUCCESS; @@ -578,8 +579,9 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, assert(! (convertor->flags & CONVERTOR_SEND)); OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); - if( convertor->flags & CONVERTOR_WITH_CHECKSUM ) { - if( !(convertor->flags & CONVERTOR_HOMOGENEOUS) ) { +#if defined(CHECKSUM) + if( OPAL_UNLIKELY(convertor->flags & CONVERTOR_WITH_CHECKSUM) ) { + if( OPAL_UNLIKELY(!(convertor->flags & CONVERTOR_HOMOGENEOUS)) ) { convertor->fAdvance = opal_unpack_general_checksum; } else { if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { @@ -588,8 +590,9 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, convertor->fAdvance = opal_generic_simple_unpack_checksum; } } - } else { - if( !(convertor->flags & CONVERTOR_HOMOGENEOUS) ) { + } else +#endif /* defined(CHECKSUM) */ + if( OPAL_UNLIKELY(!(convertor->flags & CONVERTOR_HOMOGENEOUS)) ) { convertor->fAdvance = opal_unpack_general; } else { if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { @@ -598,7 +601,6 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, convertor->fAdvance = opal_generic_simple_unpack; } } - } return OPAL_SUCCESS; } @@ -617,6 +619,7 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); +#if defined(CHECKSUM) if( convertor->flags & CONVERTOR_WITH_CHECKSUM ) { if( CONVERTOR_SEND_CONVERSION == (convertor->flags & (CONVERTOR_SEND_CONVERSION|CONVERTOR_HOMOGENEOUS)) ) { convertor->fAdvance = opal_pack_general_checksum; @@ -631,7 +634,8 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, convertor->fAdvance = opal_generic_simple_pack_checksum; } } - } else { + } else +#endif /* defined(CHECKSUM) */ if( CONVERTOR_SEND_CONVERSION == (convertor->flags & (CONVERTOR_SEND_CONVERSION|CONVERTOR_HOMOGENEOUS)) ) { convertor->fAdvance = opal_pack_general; } else { @@ -645,7 +649,6 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, convertor->fAdvance = opal_generic_simple_pack; } } - } return OPAL_SUCCESS; } @@ -699,12 +702,12 @@ int opal_convertor_clone( const opal_convertor_t* source, void opal_convertor_dump( opal_convertor_t* convertor ) { - opal_output( 0, "Convertor %p count %" PRIsize_t" stack position %d bConverted %" PRIsize_t "\n" - "\tlocal_size %ld remote_size %ld flags %X stack_size %d pending_length %" PRIsize_t "\n" + opal_output( 0, "Convertor %p count %" PRIsize_t " stack position %u bConverted %" PRIsize_t "\n" + "\tlocal_size %" PRIsize_t " remote_size %" PRIsize_t " flags %X stack_size %u pending_length %" PRIsize_t "\n" "\tremote_arch %u local_arch %u\n", (void*)convertor, convertor->count, convertor->stack_pos, convertor->bConverted, - (unsigned long)convertor->local_size, (unsigned long)convertor->remote_size, + convertor->local_size, convertor->remote_size, convertor->flags, convertor->stack_size, convertor->partial_length, convertor->remoteArch, opal_local_arch ); if( convertor->flags & CONVERTOR_RECV ) opal_output( 0, "unpack "); diff --git a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h index 875c111b1f1..b24d94c37b0 100644 --- a/opal/datatype/opal_convertor.h +++ b/opal/datatype/opal_convertor.h @@ -332,8 +332,10 @@ opal_convertor_set_position( opal_convertor_t* convertor, /* Remove the completed flag if it's already set */ convertor->flags &= ~CONVERTOR_COMPLETED; - if( !(convertor->flags & CONVERTOR_WITH_CHECKSUM) && - (convertor->flags & OPAL_DATATYPE_FLAG_NO_GAPS) && + if( (convertor->flags & OPAL_DATATYPE_FLAG_NO_GAPS) && +#if defined(CHECKSUM) + !(convertor->flags & CONVERTOR_WITH_CHECKSUM) && +#endif /* defined(CHECKSUM) */ (convertor->flags & (CONVERTOR_SEND | CONVERTOR_HOMOGENEOUS)) ) { /* Contiguous and no checkpoint and no homogeneous unpack */ convertor->bConverted = *position; diff --git a/opal/datatype/opal_convertor_internal.h b/opal/datatype/opal_convertor_internal.h index 025633cb7e7..39690f5bd19 100644 --- a/opal/datatype/opal_convertor_internal.h +++ b/opal/datatype/opal_convertor_internal.h @@ -50,11 +50,6 @@ opal_convertor_master_t* opal_convertor_find_or_create_master( uint32_t remote_a void opal_convertor_destroy_masters( void ); -#if OPAL_ENABLE_DEBUG -extern bool opal_pack_debug; -extern bool opal_unpack_debug; -#endif /* OPAL_ENABLE_DEBUG */ - END_C_DECLS #endif /* OPAL_CONVERTOR_INTERNAL_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/opal_convertor_raw.c b/opal/datatype/opal_convertor_raw.c index 28022809679..5bea5dcf5b8 100644 --- a/opal/datatype/opal_convertor_raw.c +++ b/opal/datatype/opal_convertor_raw.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ /* - * Copyright (c) 2004-2009 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. @@ -25,29 +25,53 @@ #if OPAL_ENABLE_DEBUG #include "opal/util/output.h" -#define DO_DEBUG(INST) if( opal_pack_debug ) { INST } +#define DO_DEBUG(INST) if( opal_ddt_raw_debug ) { INST } #else #define DO_DEBUG(INST) #endif /* OPAL_ENABLE_DEBUG */ +/* Take a new iovec (base + len) and try to merge it with what we already + * have. If we succeed return 0 and move forward, otherwise save it into a new + * iovec location. If we need to advance position and we reach the end + * of the iovec array, return 1 to signal we did not saved the last iovec. + */ +static inline int +opal_convertor_merge_iov( struct iovec* iov, uint32_t* iov_count, + IOVBASE_TYPE* base, size_t len, + uint32_t* idx ) +{ + if( 0 != iov[*idx].iov_len ) { + if( (base == ((char*)iov[*idx].iov_base + iov[*idx].iov_len)) ) { + iov[*idx].iov_len += len; /* merge with previous iovec */ + return 0; + } /* cannot merge, move to the next position */ + *idx = *idx + 1; + if( *idx == *iov_count ) return 1; /* do not overwrite outside the iovec array boundaries */ + } + iov[*idx].iov_base = base; + iov[*idx].iov_len = len; + return 0; +} + /** * This function always work in local representation. This means no representation - * conversion (i.e. no heterogeneity) has to be taken into account, and that all + * conversion (i.e. no heterogeneity) is taken into account, and that all * length we're working on are local. */ int32_t opal_convertor_raw( opal_convertor_t* pConvertor, - struct iovec* iov, uint32_t* iov_count, - size_t* length ) + struct iovec* iov, uint32_t* iov_count, + size_t* length ) { const opal_datatype_t *pData = pConvertor->pDesc; dt_stack_t* pStack; /* pointer to the position on the stack */ uint32_t pos_desc; /* actual position in the description of the derived datatype */ size_t count_desc; /* the number of items already done in the actual pos_desc */ + size_t do_now, blength; dt_elem_desc_t* description, *pElem; unsigned char *source_base; /* origin of the data */ - size_t raw_data = 0; /* sum of raw data lengths in the iov_len fields */ - uint32_t index = 0; /* the iov index and a simple counter */ + size_t sum_iov_len = 0; /* sum of raw data lengths in the iov_len fields */ + uint32_t index = 0; /* the iov index and a simple counter */ assert( (*iov_count) > 0 ); if( OPAL_LIKELY(pConvertor->flags & CONVERTOR_COMPLETED) ) { @@ -77,9 +101,9 @@ opal_convertor_raw( opal_convertor_t* pConvertor, description = pConvertor->use_desc->desc; /* For the first step we have to add both displacement to the source. After in the - * main while loop we will set back the source_base to the correct value. This is - * due to the fact that the convertor can stop in the middle of a data with a count - */ + * main while loop we will set back the source_base to the correct value. This is + * due to the fact that the convertor can stop in the middle of a data with a count + */ pStack = pConvertor->pStack + pConvertor->stack_pos; pos_desc = pStack->index; source_base = pConvertor->pBaseBuf + pStack->disp; @@ -87,64 +111,86 @@ opal_convertor_raw( opal_convertor_t* pConvertor, pStack--; pConvertor->stack_pos--; pElem = &(description[pos_desc]); - source_base += pStack->disp; + DO_DEBUG( opal_output( 0, "raw start pos_desc %d count_desc %" PRIsize_t " disp %ld\n" "stack_pos %d pos_desc %d count_desc %" PRIsize_t " disp %ld\n", pos_desc, count_desc, (long)(source_base - pConvertor->pBaseBuf), pConvertor->stack_pos, pStack->index, pStack->count, (long)pStack->disp ); ); + + iov[index].iov_len = 0; + /* Special case if we start from a position that is in the middle of a data element blocklen. + * We can treat this outside the loop as it is an exception that can only happen once, + * and will simplify the loop handling. + */ + if( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + const ddt_elem_desc_t* current = &(pElem->elem); + + if( count_desc != (current->count * current->blocklen) ) { /* Not the full element description */ + if( (do_now = count_desc % current->blocklen) ) { + do_now = current->blocklen - do_now; /* how much left in the block */ + source_base += current->disp; + blength = do_now * opal_datatype_basicDatatypes[current->common.type]->size; + OPAL_DATATYPE_SAFEGUARD_POINTER( source_base, blength, pConvertor->pBaseBuf, + pConvertor->pDesc, pConvertor->count ); + DO_DEBUG( opal_output( 0, "raw 1. iov[%d] = {base %p, length %" PRIsize_t "}\n", + index, (void*)source_base, blength ); ); + opal_convertor_merge_iov( iov, iov_count, + (IOVBASE_TYPE *) source_base, blength, &index ); + /* ignore the return value, we know there was at least one element in the iovec */ + sum_iov_len += blength; + count_desc -= do_now; + + source_base += (blength - current->blocklen * opal_datatype_basicDatatypes[current->common.type]->size + + current->extent - current->disp); + } + } + } + while( 1 ) { while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { - size_t blength = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; - source_base += pElem->elem.disp; - if( blength == (size_t)pElem->elem.extent ) { /* no resized data */ - if( index < *iov_count ) { - blength *= count_desc; - /* now here we have a basic datatype */ - OPAL_DATATYPE_SAFEGUARD_POINTER( source_base, blength, pConvertor->pBaseBuf, - pConvertor->pDesc, pConvertor->count ); - DO_DEBUG( opal_output( 0, "raw 1. iov[%d] = {base %p, length %" PRIsize_t "}\n", - index, (void*)source_base, (unsigned long)blength ); ); - iov[index].iov_base = (IOVBASE_TYPE *) source_base; - iov[index].iov_len = blength; - source_base += blength; - raw_data += blength; - index++; - count_desc = 0; - } - } else { - for(size_t i = count_desc; (i > 0) && (index < *iov_count); i--, index++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( source_base, blength, pConvertor->pBaseBuf, - pConvertor->pDesc, pConvertor->count ); - DO_DEBUG( opal_output( 0, "raw 2. iov[%d] = {base %p, length %" PRIsize_t "}\n", - index, (void*)source_base, (unsigned long)blength ); ); - iov[index].iov_base = (IOVBASE_TYPE *) source_base; - iov[index].iov_len = blength; - source_base += pElem->elem.extent; - raw_data += blength; - count_desc--; - } + const ddt_elem_desc_t* current = &(pElem->elem); + source_base += current->disp; + + do_now = current->count; + if( count_desc != (current->count * current->blocklen) ) { + do_now = count_desc / current->blocklen; + assert( 0 == (count_desc % current->blocklen) ); + } + + blength = current->blocklen * opal_datatype_basicDatatypes[current->common.type]->size; + for(size_t _i = 0; _i < do_now; _i++ ) { + OPAL_DATATYPE_SAFEGUARD_POINTER( source_base, blength, pConvertor->pBaseBuf, + pConvertor->pDesc, pConvertor->count ); + DO_DEBUG( opal_output( 0, "raw 2. iov[%d] = {base %p, length %" PRIsize_t "}\n", + index, (void*)source_base, blength ); ); + if( opal_convertor_merge_iov( iov, iov_count, + (IOVBASE_TYPE *) source_base, blength, &index ) ) + break; /* no more iovec available, bail out */ + + source_base += current->extent; + sum_iov_len += blength; + count_desc -= current->blocklen; } - source_base -= pElem->elem.disp; + if( 0 == count_desc ) { /* completed */ source_base = pConvertor->pBaseBuf + pStack->disp; pos_desc++; /* advance to the next data */ UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); continue; } + source_base -= current->disp; goto complete_loop; } if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ DO_DEBUG( opal_output( 0, "raw end_loop count %" PRIsize_t " stack_pos %d" - " pos_desc %d disp %ld space %lu\n", + " pos_desc %d disp %ld space %" PRIsize_t "\n", pStack->count, pConvertor->stack_pos, - pos_desc, (long)pStack->disp, (unsigned long)raw_data ); ); + pos_desc, (long)pStack->disp, sum_iov_len ); ); if( --(pStack->count) == 0 ) { /* end of loop */ - if( pConvertor->stack_pos == 0 ) { - /* we lie about the size of the next element in order to - * make sure we exit the main loop. - */ - *iov_count = index; - goto complete_loop; /* completed */ + if( 0 == pConvertor->stack_pos ) { + /* we're done. Force the exit of the main for loop (around iovec) */ + index++; /* account for the currently updating iovec */ + goto complete_loop; } pConvertor->stack_pos--; pStack--; @@ -155,15 +201,15 @@ opal_convertor_raw( opal_convertor_t* pConvertor, pStack->disp += (pData->ub - pData->lb); } else { assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); - pStack->disp += description[pStack->index].loop.extent; + pStack->disp += description[pStack->index].loop.extent; /* jump by the loop extent */ } } source_base = pConvertor->pBaseBuf + pStack->disp; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); DO_DEBUG( opal_output( 0, "raw new_loop count %" PRIsize_t " stack_pos %d " - "pos_desc %d disp %ld space %lu\n", + "pos_desc %d disp %ld space %" PRIsize_t "\n", pStack->count, pConvertor->stack_pos, - pos_desc, (long)pStack->disp, (unsigned long)raw_data ); ); + pos_desc, (long)pStack->disp, sum_iov_len ); ); } if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { ptrdiff_t local_disp = (ptrdiff_t)source_base; @@ -172,42 +218,39 @@ opal_convertor_raw( opal_convertor_t* pConvertor, if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { ptrdiff_t offset = end_loop->first_elem_disp; source_base += offset; - for(size_t i = MIN(count_desc, *iov_count - index); i > 0; i--, index++ ) { + for(; count_desc > 0; ) { OPAL_DATATYPE_SAFEGUARD_POINTER( source_base, end_loop->size, pConvertor->pBaseBuf, - pConvertor->pDesc, pConvertor->count ); - iov[index].iov_base = (IOVBASE_TYPE *) source_base; - iov[index].iov_len = end_loop->size; + pConvertor->pDesc, pConvertor->count ); + if( opal_convertor_merge_iov( iov, iov_count, + (IOVBASE_TYPE *) source_base, end_loop->size, &index ) ) { + source_base -= offset; + goto complete_loop; + } + source_base += pElem->loop.extent; - raw_data += end_loop->size; + sum_iov_len += end_loop->size; count_desc--; DO_DEBUG( opal_output( 0, "raw contig loop generate iov[%d] = {base %p, length %" PRIsize_t "}" - "space %lu [pos_desc %d]\n", + "space %" PRIsize_t " [pos_desc %d]\n", index, iov[index].iov_base, iov[index].iov_len, - (unsigned long)raw_data, pos_desc ); ); + sum_iov_len, pos_desc ); ); } source_base -= offset; - if( 0 == count_desc ) { /* completed */ - pos_desc += pElem->loop.items + 1; - goto update_loop_description; - } - } - if( index == *iov_count ) { /* all iov have been filled, we need to bail out */ - goto complete_loop; + pos_desc += pElem->loop.items + 1; + } else { + local_disp = (ptrdiff_t)source_base - local_disp; + PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, + pStack->disp + local_disp); + pos_desc++; } - local_disp = (ptrdiff_t)source_base - local_disp; - PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, - pStack->disp + local_disp); - pos_desc++; - update_loop_description: /* update the current state */ source_base = pConvertor->pBaseBuf + pStack->disp; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" ); - continue; } } -complete_loop: - pConvertor->bConverted += raw_data; /* update the already converted bytes */ - *length = raw_data; + complete_loop: + pConvertor->bConverted += sum_iov_len; /* update the already converted bytes */ + *length = sum_iov_len; *iov_count = index; if( pConvertor->bConverted == pConvertor->local_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; @@ -215,7 +258,7 @@ opal_convertor_raw( opal_convertor_t* pConvertor, } /* I complete an element, next step I should go to the next one */ PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_UINT1, count_desc, - source_base - pStack->disp - pConvertor->pBaseBuf ); + source_base - pConvertor->pBaseBuf ); DO_DEBUG( opal_output( 0, "raw save stack stack_pos %d pos_desc %d count_desc %" PRIsize_t " disp %ld\n", pConvertor->stack_pos, pStack->index, pStack->count, (long)pStack->disp ); ); return 0; diff --git a/opal/datatype/opal_datatype.h b/opal/datatype/opal_datatype.h index a836a5aae03..e1bc18c67f9 100644 --- a/opal/datatype/opal_datatype.h +++ b/opal/datatype/opal_datatype.h @@ -224,13 +224,41 @@ opal_datatype_is_contiguous_memory_layout( const opal_datatype_t* datatype, int3 } -OPAL_DECLSPEC void opal_datatype_dump( const opal_datatype_t* pData ); +OPAL_DECLSPEC void +opal_datatype_dump( const opal_datatype_t* pData ); + /* data creation functions */ -OPAL_DECLSPEC int32_t opal_datatype_clone( const opal_datatype_t * src_type, opal_datatype_t * dest_type ); -OPAL_DECLSPEC int32_t opal_datatype_create_contiguous( int count, const opal_datatype_t* oldType, opal_datatype_t** newType ); -OPAL_DECLSPEC int32_t opal_datatype_resize( opal_datatype_t* type, ptrdiff_t lb, ptrdiff_t extent ); -OPAL_DECLSPEC int32_t opal_datatype_add( opal_datatype_t* pdtBase, const opal_datatype_t* pdtAdd, size_t count, - ptrdiff_t disp, ptrdiff_t extent ); + +/** + * Create a duplicate of the source datatype. + */ +OPAL_DECLSPEC int32_t +opal_datatype_clone( const opal_datatype_t* src_type, + opal_datatype_t* dest_type ); +/** + * A contiguous array of identical datatypes. + */ +OPAL_DECLSPEC int32_t +opal_datatype_create_contiguous( int count, const opal_datatype_t* oldType, + opal_datatype_t** newType ); +/** + * Add a new datatype to the base type description. The count is the number + * repetitions of the same element to be added, and the extent is the extent + * of each element. The displacement is the initial displacement of the + * first element. + */ +OPAL_DECLSPEC int32_t +opal_datatype_add( opal_datatype_t* pdtBase, + const opal_datatype_t* pdtAdd, size_t count, + ptrdiff_t disp, ptrdiff_t extent ); + +/** + * Alter the lb and extent of an existing datatype in place. + */ +OPAL_DECLSPEC int32_t +opal_datatype_resize( opal_datatype_t* type, + ptrdiff_t lb, + ptrdiff_t extent ); static inline int32_t opal_datatype_type_lb( const opal_datatype_t* pData, ptrdiff_t* disp ) diff --git a/opal/datatype/opal_datatype_add.c b/opal/datatype/opal_datatype_add.c index 146ce12afe2..108b4e3d1be 100644 --- a/opal/datatype/opal_datatype_add.c +++ b/opal/datatype/opal_datatype_add.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -281,15 +281,23 @@ int32_t opal_datatype_add( opal_datatype_t* pdtBase, const opal_datatype_t* pdtA if( (pdtAdd->flags & (OPAL_DATATYPE_FLAG_PREDEFINED | OPAL_DATATYPE_FLAG_DATA)) == (OPAL_DATATYPE_FLAG_PREDEFINED | OPAL_DATATYPE_FLAG_DATA) ) { if( NULL != pdtBase->ptypes ) pdtBase->ptypes[pdtAdd->id] += count; + + pLast->elem.common.flags = pdtAdd->flags & ~(OPAL_DATATYPE_FLAG_COMMITTED); pLast->elem.common.type = pdtAdd->id; - pLast->elem.count = count; pLast->elem.disp = disp; - pLast->elem.extent = extent; - pdtBase->desc.used++; - pLast->elem.common.flags = pdtAdd->flags & ~(OPAL_DATATYPE_FLAG_COMMITTED); - if( (extent != (ptrdiff_t)pdtAdd->size) && (count > 1) ) { /* gaps around the datatype */ - pLast->elem.common.flags &= ~(OPAL_DATATYPE_FLAG_CONTIGUOUS | OPAL_DATATYPE_FLAG_NO_GAPS); + pLast->elem.extent = count * extent; + /* assume predefined datatypes without extent, aka. contiguous */ + pLast->elem.count = 1; + pLast->elem.blocklen = count; + if( extent != (ptrdiff_t)pdtAdd->size ) { /* not contiguous: let's fix */ + pLast->elem.count = count; + pLast->elem.blocklen = 1; + pLast->elem.extent = extent; + if( count > 1 ) { /* gaps around the predefined datatype */ + pLast->elem.common.flags &= ~(OPAL_DATATYPE_FLAG_CONTIGUOUS | OPAL_DATATYPE_FLAG_NO_GAPS); + } } + pdtBase->desc.used++; } else { /* keep trace of the total number of basic datatypes in the datatype definition */ pdtBase->loops += pdtAdd->loops; @@ -299,13 +307,40 @@ int32_t opal_datatype_add( opal_datatype_t* pdtBase, const opal_datatype_t* pdtA for( i = OPAL_DATATYPE_FIRST_TYPE; i < OPAL_DATATYPE_MAX_PREDEFINED; i++ ) if( pdtAdd->ptypes[i] != 0 ) pdtBase->ptypes[i] += (count * pdtAdd->ptypes[i]); } - if( (1 == pdtAdd->desc.used) && (extent == (pdtAdd->ub - pdtAdd->lb)) && - (extent == pdtAdd->desc.desc[0].elem.extent) ){ + if( 1 == pdtAdd->desc.used ) { pLast->elem = pdtAdd->desc.desc[0].elem; - pLast->elem.count *= count; pLast->elem.disp += disp; + if( 1 == count ) { + /* Extent only has a meaning when there are multiple elements. Bail out */ + } else if( 1 == pLast->elem.count ) { + /* The size and true_extent of the added datatype are identical, signaling a datatype + * that is mostly contiguous with the exception of the initial and final gaps. These + * gaps do not matter here as they will amended (the initial gaps being shifted by the + * new displacement and the final gap being replaced with the new gap + */ + if( pdtAdd->desc.desc[0].elem.extent == extent ) { + /* pure bliss everything is fully contiguous and we can collapse + * everything by updating the blocklen and extent + */ + pLast->elem.blocklen *= count; + pLast->elem.extent *= count; + } else { + pLast->elem.count = count; + pLast->elem.extent = extent; + } + } else if( extent == (ptrdiff_t)(pLast->elem.count * pLast->elem.extent) ) { + /* It's just a repetition of the same element, increase the count */ + pLast->elem.count *= count; + } else { + /* No luck here, no optimization can be applied. Fall back to the + * normal case where we add a loop around the datatype. + */ + goto build_loop; + } pdtBase->desc.used++; } else { + +build_loop: /* if the extent of the datatype is the same as the extent of the loop * description of the datatype then we simply have to update the main loop. */ diff --git a/opal/datatype/opal_datatype_copy.c b/opal/datatype/opal_datatype_copy.c index 7bf94ef97b9..c70bdd24dfa 100644 --- a/opal/datatype/opal_datatype_copy.c +++ b/opal/datatype/opal_datatype_copy.c @@ -36,7 +36,7 @@ #if OPAL_ENABLE_DEBUG -#define DO_DEBUG(INST) if( opal_copy_debug ) { INST } +#define DO_DEBUG(INST) if( opal_ddt_copy_debug ) { INST } #else #define DO_DEBUG(INST) #endif /* OPAL_ENABLE_DEBUG */ diff --git a/opal/datatype/opal_datatype_copy.h b/opal/datatype/opal_datatype_copy.h index 7aeac8e63ec..11058012e1e 100644 --- a/opal/datatype/opal_datatype_copy.h +++ b/opal/datatype/opal_datatype_copy.h @@ -48,37 +48,33 @@ static inline void _predefined_data( const dt_elem_desc_t* ELEM, unsigned char* DESTINATION, size_t* SPACE ) { - size_t _copy_count = (COUNT); - size_t _copy_blength; const ddt_elem_desc_t* _elem = &((ELEM)->elem); unsigned char* _source = (SOURCE) + _elem->disp; unsigned char* _destination = (DESTINATION) + _elem->disp; + size_t do_now = _elem->count, do_now_bytes; - _copy_blength = opal_datatype_basicDatatypes[_elem->common.type]->size; + assert( (COUNT) == (do_now * _elem->blocklen)); - if( _copy_blength == (size_t)_elem->extent ) { - _copy_blength *= _copy_count; - OPAL_DATATYPE_SAFEGUARD_POINTER( _source, _copy_blength, (SOURCE_BASE), - (DATATYPE), (TOTAL_COUNT) ); - /* the extent and the size of the basic datatype are equals */ - DO_DEBUG( opal_output( 0, "copy 1. %s( %p, %p, %" PRIsize_t " ) => space %" PRIsize_t "\n", - STRINGIFY(MEM_OP_NAME), (void*)_destination, (void*)_source, _copy_blength, *(SPACE) ); ); - MEM_OP( _destination, _source, _copy_blength ); - _source += _copy_blength; - _destination += _copy_blength; - } else { - for(size_t _i = 0; _i < _copy_count; _i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( _source, _copy_blength, (SOURCE_BASE), - (DATATYPE), (TOTAL_COUNT) ); - DO_DEBUG( opal_output( 0, "copy 2. %s( %p, %p, %lu ) => space %lu\n", - STRINGIFY(MEM_OP_NAME), (void*)_destination, (void*)_source, (unsigned long)_copy_blength, (unsigned long)(*(SPACE) - (_i * _copy_blength)) ); ); - MEM_OP( _destination, _source, _copy_blength ); - _source += _elem->extent; - _destination += _elem->extent; - } - _copy_blength *= _copy_count; + /* We don't a prologue and epilogue here as we are __always__ working + * with full copies of the data description. + */ + + /** + * Compute how many full blocklen we need to do and do them. + */ + do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size; + assert( (do_now * do_now_bytes) <= (*SPACE) ); + + for(size_t _i = 0; _i < do_now; _i++ ) { + OPAL_DATATYPE_SAFEGUARD_POINTER( _source, do_now_bytes, (SOURCE_BASE), + (DATATYPE), (TOTAL_COUNT) ); + DO_DEBUG( opal_output( 0, "copy %s( %p, %p, %" PRIsize_t " ) => space %" PRIsize_t "\n", + STRINGIFY(MEM_OP_NAME), (void*)_destination, (void*)_source, do_now_bytes, *(SPACE) - _i * do_now_bytes ); ); + MEM_OP( _destination, _source, do_now_bytes ); + _destination += _elem->extent; + _source += _elem->extent; } - *(SPACE) -= _copy_blength; + *(SPACE) -= (do_now_bytes * do_now); } static inline void _contiguous_loop( const dt_elem_desc_t* ELEM, @@ -147,12 +143,10 @@ static inline int32_t _copy_content_same_ddt( const opal_datatype_t* datatype, i if( (ptrdiff_t)datatype->size == extent ) { /* all contiguous == no gaps around */ size_t total_length = iov_len_local; size_t memop_chunk = opal_datatype_memop_block_size; + OPAL_DATATYPE_SAFEGUARD_POINTER( source, iov_len_local, + (unsigned char*)source_base, datatype, count ); while( total_length > 0 ) { if( memop_chunk > total_length ) memop_chunk = total_length; - OPAL_DATATYPE_SAFEGUARD_POINTER( destination, memop_chunk, - (unsigned char*)destination_base, datatype, count ); - OPAL_DATATYPE_SAFEGUARD_POINTER( source, memop_chunk, - (unsigned char*)source_base, datatype, count ); DO_DEBUG( opal_output( 0, "copy c1. %s( %p, %p, %lu ) => space %lu\n", STRINGIFY(MEM_OP_NAME), (void*)destination, (void*)source, (unsigned long)memop_chunk, (unsigned long)total_length ); ); MEM_OP( destination, source, memop_chunk ); @@ -184,17 +178,12 @@ static inline int32_t _copy_content_same_ddt( const opal_datatype_t* datatype, i pos_desc = 0; stack_pos = 0; - if( datatype->opt_desc.desc != NULL ) { - description = datatype->opt_desc.desc; - } else { + description = datatype->opt_desc.desc; + if( NULL == description ) { description = datatype->desc.desc; } - if( description[0].elem.common.type == OPAL_DATATYPE_LOOP ) - count_desc = description[0].loop.loops; - else - count_desc = description[0].elem.count; - pElem = &(description[pos_desc]); + UPDATE_INTERNAL_COUNTERS( description, 0, pElem, count_desc ); while( 1 ) { while( OPAL_LIKELY(pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA) ) { diff --git a/opal/datatype/opal_datatype_dump.c b/opal/datatype/opal_datatype_dump.c index 4c26292b8be..7782a805d0a 100644 --- a/opal/datatype/opal_datatype_dump.c +++ b/opal/datatype/opal_datatype_dump.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -64,7 +64,7 @@ int opal_datatype_dump_data_flags( unsigned short usflags, char* ptr, size_t len int index = 0; if( length < 22 ) return 0; index = snprintf( ptr, 22, "-----------[---][---]" ); /* set everything to - */ - if( usflags & OPAL_DATATYPE_FLAG_COMMITTED ) ptr[1] = 'c'; + if( usflags & OPAL_DATATYPE_FLAG_COMMITTED ) ptr[1] = 'c'; if( usflags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) ptr[2] = 'C'; if( usflags & OPAL_DATATYPE_FLAG_OVERLAP ) ptr[3] = 'o'; if( usflags & OPAL_DATATYPE_FLAG_USER_LB ) ptr[4] = 'l'; @@ -90,17 +90,17 @@ int opal_datatype_dump_data_desc( dt_elem_desc_t* pDesc, int nbElems, char* ptr, index += snprintf( ptr + index, length - index, "%15s ", opal_datatype_basicDatatypes[pDesc->elem.common.type]->name ); if( length <= (size_t)index ) break; if( OPAL_DATATYPE_LOOP == pDesc->elem.common.type ) - index += snprintf( ptr + index, length - index, "%d times the next %d elements extent %d\n", - (int)pDesc->loop.loops, (int)pDesc->loop.items, - (int)pDesc->loop.extent ); + index += snprintf( ptr + index, length - index, "%u times the next %u elements extent %td\n", + pDesc->loop.loops, pDesc->loop.items, + pDesc->loop.extent ); else if( OPAL_DATATYPE_END_LOOP == pDesc->elem.common.type ) - index += snprintf( ptr + index, length - index, "prev %d elements first elem displacement %ld size of data %d\n", - (int)pDesc->end_loop.items, (long)pDesc->end_loop.first_elem_disp, - (int)pDesc->end_loop.size ); + index += snprintf( ptr + index, length - index, "prev %u elements first elem displacement %td size of data %" PRIsize_t "\n", + pDesc->end_loop.items, pDesc->end_loop.first_elem_disp, + pDesc->end_loop.size ); else - index += snprintf( ptr + index, length - index, "count %" PRIsize_t " disp 0x%lx (%ld) blen %d extent %ld (size %ld)\n", - pDesc->elem.count, (long)pDesc->elem.disp, (long)pDesc->elem.disp, (int)pDesc->elem.blocklen, - pDesc->elem.extent, (long)(pDesc->elem.count * opal_datatype_basicDatatypes[pDesc->elem.common.type]->size) ); + index += snprintf( ptr + index, length - index, "count %" PRIsize_t " disp 0x%tx (%td) blen %u extent %td (size %zd)\n", + pDesc->elem.count, pDesc->elem.disp, pDesc->elem.disp, pDesc->elem.blocklen, + pDesc->elem.extent, (pDesc->elem.count * pDesc->elem.blocklen * opal_datatype_basicDatatypes[pDesc->elem.common.type]->size) ); pDesc++; if( length <= (size_t)index ) break; @@ -118,13 +118,13 @@ void opal_datatype_dump( const opal_datatype_t* pData ) length = pData->opt_desc.used + pData->desc.used; length = length * 100 + 500; buffer = (char*)malloc( length ); - index += snprintf( buffer, length - index, "Datatype %p[%s] size %ld align %d id %d length %d used %d\n" - "true_lb %ld true_ub %ld (true_extent %ld) lb %ld ub %ld (extent %ld)\n" - "nbElems %" PRIsize_t " loops %d flags %X (", - (void*)pData, pData->name, (long)pData->size, (int)pData->align, pData->id, (int)pData->desc.length, (int)pData->desc.used, - (long)pData->true_lb, (long)pData->true_ub, (long)(pData->true_ub - pData->true_lb), - (long)pData->lb, (long)pData->ub, (long)(pData->ub - pData->lb), - pData->nbElems, (int)pData->loops, (int)pData->flags ); + index += snprintf( buffer, length - index, "Datatype %p[%s] size %" PRIsize_t " align %u id %u length %" PRIsize_t " used %" PRIsize_t "\n" + "true_lb %td true_ub %td (true_extent %td) lb %td ub %td (extent %td)\n" + "nbElems %" PRIsize_t " loops %u flags %X (", + (void*)pData, pData->name, pData->size, pData->align, (uint32_t)pData->id, pData->desc.length, pData->desc.used, + pData->true_lb, pData->true_ub, pData->true_ub - pData->true_lb, + pData->lb, pData->ub, pData->ub - pData->lb, + pData->nbElems, pData->loops, (int)pData->flags ); /* dump the flags */ if( pData->flags == OPAL_DATATYPE_FLAG_PREDEFINED ) index += snprintf( buffer + index, length - index, "predefined " ); diff --git a/opal/datatype/opal_datatype_get_count.c b/opal/datatype/opal_datatype_get_count.c index ae085c42704..f75b86d0e2d 100644 --- a/opal/datatype/opal_datatype_get_count.c +++ b/opal/datatype/opal_datatype_get_count.c @@ -69,14 +69,14 @@ ssize_t opal_datatype_get_element_count( const opal_datatype_t* datatype, size_t while( pElems[pos_desc].elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ const opal_datatype_t* basic_type = BASIC_DDT_FROM_ELEM(pElems[pos_desc]); - local_size = pElems[pos_desc].elem.count * basic_type->size; + local_size = (pElems[pos_desc].elem.count * pElems[pos_desc].elem.blocklen) * basic_type->size; if( local_size >= iSize ) { local_size = iSize / basic_type->size; nbElems += (int32_t)local_size; iSize -= local_size * basic_type->size; return (iSize == 0 ? nbElems : -1); } - nbElems += pElems[pos_desc].elem.count; + nbElems += (pElems[pos_desc].elem.count * pElems[pos_desc].elem.blocklen); iSize -= local_size; pos_desc++; /* advance to the next data */ } @@ -131,7 +131,7 @@ int32_t opal_datatype_set_element_count( const opal_datatype_t* datatype, size_t while( pElems[pos_desc].elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ const opal_datatype_t* basic_type = BASIC_DDT_FROM_ELEM(pElems[pos_desc]); - local_length = pElems[pos_desc].elem.count; + local_length = (pElems[pos_desc].elem.count * pElems[pos_desc].elem.blocklen); if( local_length >= count ) { *length += count * basic_type->size; return 0; @@ -188,8 +188,8 @@ int opal_datatype_compute_ptypes( opal_datatype_t* datatype ) } while( pElems[pos_desc].elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ - datatype->ptypes[pElems[pos_desc].elem.common.type] += pElems[pos_desc].elem.count; - nbElems += pElems[pos_desc].elem.count; + datatype->ptypes[pElems[pos_desc].elem.common.type] += pElems[pos_desc].elem.count * pElems[pos_desc].elem.blocklen; + nbElems += pElems[pos_desc].elem.count * pElems[pos_desc].elem.blocklen; DUMP( " compute_ptypes-add: type %d count %"PRIsize_t" (total type %"PRIsize_t" total %lld)\n", pElems[pos_desc].elem.common.type, datatype->ptypes[pElems[pos_desc].elem.common.type], diff --git a/opal/datatype/opal_datatype_internal.h b/opal/datatype/opal_datatype_internal.h index bc3f8aa7cab..bdeb0cc429e 100644 --- a/opal/datatype/opal_datatype_internal.h +++ b/opal/datatype/opal_datatype_internal.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2018 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -215,19 +215,23 @@ union dt_elem_desc { /** - * Create one or more elements depending on the value of _count. If the value - * is too large for the type of elem.count then use oth the elem.count and - * elem.blocklen to create it. If the number is prime then create a second - * element to account for the difference. + * Create an element entry in the description. If the element is contiguous + * collapse everything into the blocklen. */ -#define CREATE_ELEM( _place, _type, _flags, _count, _disp, _extent ) \ +#define CREATE_ELEM(_place, _type, _flags, _blocklen, _count, _disp, _extent) \ do { \ (_place)->elem.common.flags = (_flags) | OPAL_DATATYPE_FLAG_DATA; \ (_place)->elem.common.type = (_type); \ - (_place)->elem.disp = (_disp); \ - (_place)->elem.extent = (_extent); \ + (_place)->elem.blocklen = (_blocklen); \ (_place)->elem.count = (_count); \ - (_place)->elem.blocklen = 1; \ + (_place)->elem.extent = (_extent); \ + (_place)->elem.disp = (_disp); \ + if( _extent == (ptrdiff_t)(_blocklen * opal_datatype_basicDatatypes[_type]->size) ) { \ + /* collapse it into a single large blocklen */ \ + (_place)->elem.blocklen *= _count; \ + (_place)->elem.extent *= _count; \ + (_place)->elem.count = 1; \ + } \ } while(0) /* * This array holds the descriptions desc.desc[2] of the predefined basic datatypes. @@ -480,22 +484,23 @@ static inline int GET_FIRST_NON_LOOP( const union dt_elem_desc* _pElem ) } #define UPDATE_INTERNAL_COUNTERS( DESCRIPTION, POSITION, ELEMENT, COUNTER ) \ - do { \ - (ELEMENT) = &((DESCRIPTION)[(POSITION)]); \ - if( OPAL_DATATYPE_LOOP == (ELEMENT)->elem.common.type ) \ - (COUNTER) = (ELEMENT)->loop.loops; \ - else \ - (COUNTER) = (ELEMENT)->elem.count; \ + do { \ + (ELEMENT) = &((DESCRIPTION)[(POSITION)]); \ + if( OPAL_DATATYPE_LOOP == (ELEMENT)->elem.common.type ) \ + (COUNTER) = (ELEMENT)->loop.loops; \ + else \ + (COUNTER) = (ELEMENT)->elem.count * (ELEMENT)->elem.blocklen; \ } while (0) OPAL_DECLSPEC int opal_datatype_contain_basic_datatypes( const struct opal_datatype_t* pData, char* ptr, size_t length ); OPAL_DECLSPEC int opal_datatype_dump_data_flags( unsigned short usflags, char* ptr, size_t length ); OPAL_DECLSPEC int opal_datatype_dump_data_desc( union dt_elem_desc* pDesc, int nbElems, char* ptr, size_t length ); -#if OPAL_ENABLE_DEBUG -extern bool opal_position_debug; -extern bool opal_copy_debug; -#endif /* OPAL_ENABLE_DEBUG */ +extern bool opal_ddt_position_debug; +extern bool opal_ddt_copy_debug; +extern bool opal_ddt_unpack_debug; +extern bool opal_ddt_pack_debug; +extern bool opal_ddt_raw_debug; END_C_DECLS #endif /* OPAL_DATATYPE_INTERNAL_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/opal_datatype_module.c b/opal/datatype/opal_datatype_module.c index 2d8dedc94e7..ba933b5fe2b 100644 --- a/opal/datatype/opal_datatype_module.c +++ b/opal/datatype/opal_datatype_module.c @@ -37,10 +37,11 @@ /* by default the debuging is turned off */ int opal_datatype_dfd = -1; -bool opal_unpack_debug = false; -bool opal_pack_debug = false; -bool opal_position_debug = false; -bool opal_copy_debug = false; +bool opal_ddt_unpack_debug = false; +bool opal_ddt_pack_debug = false; +bool opal_ddt_position_debug = false; +bool opal_ddt_copy_debug = false; +bool opal_ddt_raw_debug = false; int opal_ddt_verbose = -1; /* Has the datatype verbose it's own output stream */ extern int opal_cuda_verbose; @@ -148,35 +149,43 @@ int opal_datatype_register_params(void) int ret; ret = mca_base_var_register ("opal", "mpi", NULL, "ddt_unpack_debug", - "Whether to output debugging information in the ddt unpack functions (nonzero = enabled)", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_LOCAL, &opal_unpack_debug); + "Whether to output debugging information in the ddt unpack functions (nonzero = enabled)", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, &opal_ddt_unpack_debug); if (0 > ret) { - return ret; + return ret; } ret = mca_base_var_register ("opal", "mpi", NULL, "ddt_pack_debug", - "Whether to output debugging information in the ddt pack functions (nonzero = enabled)", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_LOCAL, &opal_pack_debug); + "Whether to output debugging information in the ddt pack functions (nonzero = enabled)", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, &opal_ddt_pack_debug); if (0 > ret) { - return ret; + return ret; + } + + ret = mca_base_var_register ("opal", "mpi", NULL, "ddt_raw_debug", + "Whether to output debugging information in the ddt raw functions (nonzero = enabled)", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, &opal_ddt_raw_debug); + if (0 > ret) { + return ret; } ret = mca_base_var_register ("opal", "mpi", NULL, "ddt_position_debug", - "Non zero lead to output generated by the datatype position functions", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_LOCAL, &opal_position_debug); + "Non zero lead to output generated by the datatype position functions", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, &opal_ddt_position_debug); if (0 > ret) { - return ret; + return ret; } ret = mca_base_var_register ("opal", "mpi", NULL, "ddt_copy_debug", - "Whether to output debugging information in the ddt copy functions (nonzero = enabled)", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_LOCAL, &opal_copy_debug); + "Whether to output debugging information in the ddt copy functions (nonzero = enabled)", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, &opal_ddt_copy_debug); if (0 > ret) { - return ret; + return ret; } ret = mca_base_var_register ("opal", "opal", NULL, "ddt_verbose", @@ -195,7 +204,7 @@ int opal_datatype_register_params(void) OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_LOCAL, &opal_cuda_verbose); if (0 > ret) { - return ret; + return ret; } #endif @@ -224,8 +233,8 @@ int32_t opal_datatype_init( void ) OPAL_DATATYPE_FLAG_CONTIGUOUS | OPAL_DATATYPE_FLAG_NO_GAPS; datatype->desc.desc[0].elem.common.type = i; - /* datatype->desc.desc[0].elem.blocklen XXX not set at the moment, it will be needed later */ datatype->desc.desc[0].elem.count = 1; + datatype->desc.desc[0].elem.blocklen = 1; datatype->desc.desc[0].elem.disp = 0; datatype->desc.desc[0].elem.extent = datatype->size; diff --git a/opal/datatype/opal_datatype_monotonic.c b/opal/datatype/opal_datatype_monotonic.c index b467d95ecbe..247fd66142d 100644 --- a/opal/datatype/opal_datatype_monotonic.c +++ b/opal/datatype/opal_datatype_monotonic.c @@ -2,6 +2,9 @@ /* * Copyright (c) 2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2018-2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -18,35 +21,43 @@ #include "opal/datatype/opal_datatype_internal.h" #include "opal/datatype/opal_convertor.h" +#define OPAL_DATATYPE_MAX_MONOTONIC_IOVEC 32 + +/** + * Check if the datatype describes a memory layout where the pointers to + * the contiguous pieces are always advancing in the same direction, i.e. + * there is no potential for overlap. + */ int32_t opal_datatype_is_monotonic(opal_datatype_t* type ) { + struct iovec iov[OPAL_DATATYPE_MAX_MONOTONIC_IOVEC]; + ptrdiff_t upper_limit = (ptrdiff_t)type->true_lb; /* as conversion base will be NULL the first address is true_lb */ + size_t max_data = 0x7FFFFFFF; opal_convertor_t *pConv; + bool monotonic = true; uint32_t iov_count; - struct iovec iov[5]; - size_t max_data = 0; - long prev = -1; int rc; - bool monotonic = true; pConv = opal_convertor_create( opal_local_arch, 0 ); if (OPAL_UNLIKELY(NULL == pConv)) { - return 0; + return -1; } rc = opal_convertor_prepare_for_send( pConv, type, 1, NULL ); if( OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { OBJ_RELEASE(pConv); - return 0; + return -1; } do { - iov_count = 5; + iov_count = OPAL_DATATYPE_MAX_MONOTONIC_IOVEC; rc = opal_convertor_raw( pConv, iov, &iov_count, &max_data); - for (uint32_t i=0; icommon.flags = OPAL_DATATYPE_FLAG_BASIC; \ - _elem->common.type = OPAL_DATATYPE_LOOP; \ - _elem->count = 0; \ - _elem->disp = 0; \ - _elem->extent = 0; \ - } while (0) - static int32_t opal_datatype_optimize_short( opal_datatype_t* pData, size_t count, dt_type_desc_t* pTypeDesc ) { dt_elem_desc_t* pElemDesc; - ddt_elem_desc_t opt_elem; - dt_stack_t* pOrigStack; - dt_stack_t* pStack; /* pointer to the position on the stack */ - int32_t pos_desc = 0; /* actual position in the description of the derived datatype */ - int32_t stack_pos = 0, last_type = OPAL_DATATYPE_UINT1; - int32_t type = OPAL_DATATYPE_LOOP, nbElems = 0, continuity; - ptrdiff_t total_disp = 0, last_extent = 1, last_disp = 0; - uint16_t last_flags = 0xFFFF; /* keep all for the first datatype */ - uint32_t i; - size_t last_length = 0; + dt_stack_t *pOrigStack, *pStack; /* pointer to the position on the stack */ + int32_t pos_desc = 0; /* actual position in the description of the derived datatype */ + int32_t stack_pos = 0; + int32_t nbElems = 0; + ptrdiff_t total_disp = 0; + ddt_elem_desc_t last = {.common.flags = 0xFFFF /* all on */, .count = 0, .disp = 0}, compress; + ddt_elem_desc_t* current; pOrigStack = pStack = (dt_stack_t*)malloc( sizeof(dt_stack_t) * (pData->loops+2) ); SAVE_STACK( pStack, -1, 0, count, 0 ); @@ -64,186 +51,199 @@ opal_datatype_optimize_short( opal_datatype_t* pData, pTypeDesc->desc = pElemDesc = (dt_elem_desc_t*)malloc( sizeof(dt_elem_desc_t) * pTypeDesc->length ); pTypeDesc->used = 0; - SET_EMPTY_ELEMENT( &opt_elem ); assert( OPAL_DATATYPE_END_LOOP == pData->desc.desc[pData->desc.used].elem.common.type ); - opt_elem.common.type = OPAL_DATATYPE_LOOP; - opt_elem.common.flags = 0xFFFF; /* keep all for the first datatype */ - opt_elem.count = 0; - opt_elem.disp = pData->desc.desc[pData->desc.used].end_loop.first_elem_disp; - opt_elem.extent = 0; while( stack_pos >= 0 ) { if( OPAL_DATATYPE_END_LOOP == pData->desc.desc[pos_desc].elem.common.type ) { /* end of the current loop */ ddt_endloop_desc_t* end_loop = &(pData->desc.desc[pos_desc].end_loop); - if( last_length != 0 ) { - CREATE_ELEM( pElemDesc, last_type, OPAL_DATATYPE_FLAG_BASIC, last_length, last_disp, last_extent ); + if( 0 != last.count ) { + CREATE_ELEM( pElemDesc, last.common.type, OPAL_DATATYPE_FLAG_BASIC, + last.blocklen, last.count, last.disp, last.extent ); pElemDesc++; nbElems++; - last_disp += last_length; - last_length = 0; + last.count= 0; } CREATE_LOOP_END( pElemDesc, nbElems - pStack->index + 1, /* # of elems in this loop */ end_loop->first_elem_disp, end_loop->size, end_loop->common.flags ); - pElemDesc++; nbElems++; if( --stack_pos >= 0 ) { /* still something to do ? */ ddt_loop_desc_t* pStartLoop = &(pTypeDesc->desc[pStack->index - 1].loop); - pStartLoop->items = end_loop->items; + pStartLoop->items = pElemDesc->end_loop.items; total_disp = pStack->disp; /* update the displacement position */ } + pElemDesc++; nbElems++; pStack--; /* go down one position on the stack */ pos_desc++; continue; } if( OPAL_DATATYPE_LOOP == pData->desc.desc[pos_desc].elem.common.type ) { ddt_loop_desc_t* loop = (ddt_loop_desc_t*)&(pData->desc.desc[pos_desc]); - ddt_endloop_desc_t* end_loop = (ddt_endloop_desc_t*)&(pData->desc.desc[pos_desc + loop->items]); int index = GET_FIRST_NON_LOOP( &(pData->desc.desc[pos_desc]) ); - ptrdiff_t loop_disp = pData->desc.desc[pos_desc + index].elem.disp; - continuity = ((last_disp + (ptrdiff_t)last_length * (ptrdiff_t)opal_datatype_basicDatatypes[last_type]->size) - == (total_disp + loop_disp)); if( loop->common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - /* the loop is contiguous or composed by contiguous elements with a gap */ - if( loop->extent == (ptrdiff_t)end_loop->size ) { - /* the whole loop is contiguous */ - if( !continuity ) { - if( 0 != last_length ) { - CREATE_ELEM( pElemDesc, last_type, OPAL_DATATYPE_FLAG_BASIC, - last_length, last_disp, last_extent ); - pElemDesc++; nbElems++; - last_length = 0; - } - last_disp = total_disp + loop_disp; - } - last_length = (last_length * opal_datatype_basicDatatypes[last_type]->size - + loop->loops * end_loop->size); - last_type = OPAL_DATATYPE_UINT1; - last_extent = 1; - } else { - int counter = loop->loops; - ptrdiff_t merged_disp = 0; - /* if the previous data is contiguous with this piece and it has a length not ZERO */ - if( last_length != 0 ) { - if( continuity ) { - last_length *= opal_datatype_basicDatatypes[last_type]->size; - last_length += end_loop->size; - last_type = OPAL_DATATYPE_UINT1; - last_extent = 1; - counter--; - merged_disp = loop->extent; /* merged loop, update the disp of the remaining elems */ - } - CREATE_ELEM( pElemDesc, last_type, OPAL_DATATYPE_FLAG_BASIC, - last_length, last_disp, last_extent ); - pElemDesc++; nbElems++; - last_disp += last_length; - last_length = 0; - last_type = OPAL_DATATYPE_LOOP; - } - /** - * The content of the loop is contiguous (maybe with a gap before or after). - * - * If any of the loops have been merged with the previous element, then the - * displacement of the first element (or the displacement of all elements if the - * loop will be removed) must be updated accordingly. - */ - if( counter <= 2 ) { - merged_disp += end_loop->first_elem_disp; - while( counter > 0 ) { - CREATE_ELEM( pElemDesc, OPAL_DATATYPE_UINT1, OPAL_DATATYPE_FLAG_BASIC, - end_loop->size, merged_disp, 1); - pElemDesc++; nbElems++; counter--; - merged_disp += loop->extent; - } - } else { - CREATE_LOOP_START( pElemDesc, counter, 2, loop->extent, loop->common.flags ); - pElemDesc++; nbElems++; - CREATE_ELEM( pElemDesc, OPAL_DATATYPE_UINT1, OPAL_DATATYPE_FLAG_BASIC, - end_loop->size, loop_disp, 1); - pElemDesc++; nbElems++; - CREATE_LOOP_END( pElemDesc, 2, end_loop->first_elem_disp + merged_disp, - end_loop->size, end_loop->common.flags ); - pElemDesc++; nbElems++; + ddt_endloop_desc_t* end_loop = (ddt_endloop_desc_t*)&(pData->desc.desc[pos_desc + loop->items]); + + assert(pData->desc.desc[pos_desc + index].elem.disp == end_loop->first_elem_disp); + compress.common.flags = loop->common.flags; + compress.common.type = pData->desc.desc[pos_desc + index].elem.common.type; + compress.blocklen = pData->desc.desc[pos_desc + index].elem.blocklen; + for( uint32_t i = index+1; i < loop->items; i++ ) { + current = &pData->desc.desc[pos_desc + i].elem; + assert(1 == current->count); + if( (current->common.type == OPAL_DATATYPE_LOOP) || + compress.common.type != current->common.type ) { + compress.common.type = OPAL_DATATYPE_UINT1; + compress.blocklen = end_loop->size; + break; } + compress.blocklen += current->blocklen; } - pos_desc += loop->items + 1; - } else { - ddt_elem_desc_t* elem = (ddt_elem_desc_t*)&(pData->desc.desc[pos_desc+1]); - if( last_length != 0 ) { - CREATE_ELEM( pElemDesc, last_type, OPAL_DATATYPE_FLAG_BASIC, last_length, last_disp, last_extent ); - pElemDesc++; nbElems++; - last_disp += last_length; - last_length = 0; - last_type = OPAL_DATATYPE_LOOP; + compress.count = loop->loops; + compress.extent = loop->extent; + compress.disp = end_loop->first_elem_disp; + if( compress.extent == (ptrdiff_t)(compress.blocklen * opal_datatype_basicDatatypes[compress.common.type]->size) ) { + /* The compressed element is contiguous: collapse it into a single large blocklen */ + compress.blocklen *= compress.count; + compress.extent *= compress.count; + compress.count = 1; } - if( 2 == loop->items ) { /* small loop */ - if( (1 == elem->count) - && (elem->extent == (ptrdiff_t)opal_datatype_basicDatatypes[elem->common.type]->size) ) { - CREATE_ELEM( pElemDesc, elem->common.type, elem->common.flags & ~OPAL_DATATYPE_FLAG_CONTIGUOUS, - loop->loops, elem->disp, loop->extent ); + /** + * The current loop has been compressed and can now be treated as if it + * was a data element. We can now look if it can be fused with last, + * as done in the fusion of 2 elements below. Let's use the same code. + */ + pos_desc += loop->items + 1; + current = &compress; + goto fuse_loops; + } + + /** + * If the content of the loop is not contiguous there is little we can do + * that would not incur significant optimization cost and still be beneficial + * in reducing the number of memcpy during pack/unpack. + */ + + if( 0 != last.count ) { /* Generate the pending element */ + CREATE_ELEM( pElemDesc, last.common.type, OPAL_DATATYPE_FLAG_BASIC, + last.blocklen, last.count, last.disp, last.extent ); + pElemDesc++; nbElems++; + last.count = 0; + last.common.type = OPAL_DATATYPE_LOOP; + } + + /* Can we unroll the loop? */ + if( (loop->items <= 3) && (loop->loops <= 2) ) { + ptrdiff_t elem_displ = 0; + for( uint32_t i = 0; i < loop->loops; i++ ) { + for( uint32_t j = 0; j < (loop->items - 1); j++ ) { + current = &pData->desc.desc[pos_desc + index + j].elem; + CREATE_ELEM( pElemDesc, current->common.type, current->common.flags, + current->blocklen, current->count, current->disp + elem_displ, current->extent ); pElemDesc++; nbElems++; - pos_desc += loop->items + 1; - goto complete_loop; - } else if( loop->loops < 3 ) { - ptrdiff_t elem_displ = elem->disp; - for( i = 0; i < loop->loops; i++ ) { - CREATE_ELEM( pElemDesc, elem->common.type, elem->common.flags, - elem->count, elem_displ, elem->extent ); - elem_displ += loop->extent; - pElemDesc++; nbElems++; - } - pos_desc += loop->items + 1; - goto complete_loop; } + elem_displ += loop->extent; } - CREATE_LOOP_START( pElemDesc, loop->loops, loop->items, loop->extent, loop->common.flags ); - pElemDesc++; nbElems++; - PUSH_STACK( pStack, stack_pos, nbElems, OPAL_DATATYPE_LOOP, loop->loops, total_disp ); - pos_desc++; - DDT_DUMP_STACK( pStack, stack_pos, pData->desc.desc, "advance loops" ); + pos_desc += loop->items + 1; + goto complete_loop; } + + CREATE_LOOP_START( pElemDesc, loop->loops, loop->items, loop->extent, loop->common.flags ); + pElemDesc++; nbElems++; + PUSH_STACK( pStack, stack_pos, nbElems, OPAL_DATATYPE_LOOP, loop->loops, total_disp ); + pos_desc++; + DDT_DUMP_STACK( pStack, stack_pos, pData->desc.desc, "advance loops" ); + complete_loop: total_disp = pStack->disp; /* update the displacement */ continue; } - while( pData->desc.desc[pos_desc].elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* keep doing it until we reach a non datatype element */ - /* now here we have a basic datatype */ - type = pData->desc.desc[pos_desc].elem.common.type; - continuity = ((last_disp + (ptrdiff_t)last_length * (ptrdiff_t)opal_datatype_basicDatatypes[last_type]->size) - == (total_disp + pData->desc.desc[pos_desc].elem.disp)); + while( pData->desc.desc[pos_desc].elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* go over all basic datatype elements */ + current = &pData->desc.desc[pos_desc].elem; + pos_desc++; /* point to the next element as current points to the current one */ - if( (pData->desc.desc[pos_desc].elem.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) && continuity && - (pData->desc.desc[pos_desc].elem.extent == (int32_t)opal_datatype_basicDatatypes[type]->size) ) { - if( type == last_type ) { - last_length += pData->desc.desc[pos_desc].elem.count; - last_extent = pData->desc.desc[pos_desc].elem.extent; - } else { - if( last_length == 0 ) { - last_type = type; - last_length = pData->desc.desc[pos_desc].elem.count; - last_extent = pData->desc.desc[pos_desc].elem.extent; - } else { - last_length = last_length * opal_datatype_basicDatatypes[last_type]->size + - pData->desc.desc[pos_desc].elem.count * opal_datatype_basicDatatypes[type]->size; - last_type = OPAL_DATATYPE_UINT1; - last_extent = 1; + fuse_loops: + if( 0 == last.count ) { /* first data of the datatype */ + last = *current; + continue; /* next data */ + } + + /* are the two elements compatible: aka they have very similar values and they + * can be merged together by increasing the count, and/or changing the extent. + */ + if( (last.blocklen * opal_datatype_basicDatatypes[last.common.type]->size) == + (current->blocklen * opal_datatype_basicDatatypes[current->common.type]->size) ) { + ddt_elem_desc_t save = last; /* safekeep the type and blocklen */ + if( last.common.type != current->common.type ) { + last.blocklen *= opal_datatype_basicDatatypes[last.common.type]->size; + last.common.type = OPAL_DATATYPE_UINT1; + } + + if( 1 == last.count ) { + /* we can ignore the extent of the element with count == 1 and merge them together if their displacements match */ + if( 1 == current->count ) { + last.extent = current->disp - last.disp; + last.count++; + continue; } + /* can we compute a matching displacement ? */ + if( (last.disp + current->extent) == current->disp ) { + last.extent = current->extent; + last.count = current->count + 1; + continue; + } + } + if( (last.extent * (ptrdiff_t)last.count + last.disp) == current->disp ) { + if( 1 == current->count ) { + last.count++; + continue; + } + if( last.extent == current->extent ) { + last.count += current->count; + continue; + } + } + last.blocklen = save.blocklen; + last.common.type = save.common.type; + /* try other optimizations */ + } + /* are the elements fusionable such that we can fusion the last blocklen of one with the first + * blocklen of the other. + */ + if( (ptrdiff_t)(last.disp + (last.count - 1) * last.extent + last.blocklen * opal_datatype_basicDatatypes[last.common.type]->size) == + current->disp ) { + if( last.count != 1 ) { + CREATE_ELEM( pElemDesc, last.common.type, OPAL_DATATYPE_FLAG_BASIC, + last.blocklen, last.count - 1, last.disp, last.extent ); + pElemDesc++; nbElems++; + last.disp += (last.count - 1) * last.extent; + last.count = 1; } - last_flags &= pData->desc.desc[pos_desc].elem.common.flags; - } else { - if( last_length != 0 ) { - CREATE_ELEM( pElemDesc, last_type, OPAL_DATATYPE_FLAG_BASIC, last_length, last_disp, last_extent ); + if( last.common.type == current->common.type ) { + last.blocklen += current->blocklen; + } else { + last.blocklen = ((last.blocklen * opal_datatype_basicDatatypes[last.common.type]->size) + + (current->blocklen * opal_datatype_basicDatatypes[current->common.type]->size)); + last.common.type = OPAL_DATATYPE_UINT1; + } + last.extent += current->extent; + if( current->count != 1 ) { + CREATE_ELEM( pElemDesc, last.common.type, OPAL_DATATYPE_FLAG_BASIC, + last.blocklen, last.count, last.disp, last.extent ); pElemDesc++; nbElems++; + last = *current; + last.count -= 1; + last.disp += last.extent; } - last_disp = total_disp + pData->desc.desc[pos_desc].elem.disp; - last_length = pData->desc.desc[pos_desc].elem.count; - last_extent = pData->desc.desc[pos_desc].elem.extent; - last_type = type; + continue; } - pos_desc++; /* advance to the next data */ + CREATE_ELEM( pElemDesc, last.common.type, OPAL_DATATYPE_FLAG_BASIC, + last.blocklen, last.count, last.disp, last.extent ); + pElemDesc++; nbElems++; + last = *current; } } - if( last_length != 0 ) { - CREATE_ELEM( pElemDesc, last_type, OPAL_DATATYPE_FLAG_BASIC, last_length, last_disp, last_extent ); + if( 0 != last.count ) { + CREATE_ELEM( pElemDesc, last.common.type, OPAL_DATATYPE_FLAG_BASIC, + last.blocklen, last.count, last.disp, last.extent ); pElemDesc++; nbElems++; } /* cleanup the stack */ diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index 55889fcaa55..f21adcccb34 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2016 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -31,7 +31,7 @@ #if OPAL_ENABLE_DEBUG #include "opal/util/output.h" -#define DO_DEBUG(INST) if( opal_pack_debug ) { INST } +#define DO_DEBUG(INST) if( opal_ddt_pack_debug ) { INST } #else #define DO_DEBUG(INST) #endif /* OPAL_ENABLE_DEBUG */ @@ -53,8 +53,6 @@ #endif /* defined(CHECKSUM) */ -#define IOVEC_MEM_LIMIT 8192 - /* the contig versions does not use the stack. They can easily retrieve * the status with just the informations from pConvertor->bConverted. */ @@ -68,9 +66,8 @@ opal_pack_homogeneous_contig_function( opal_convertor_t* pConv, unsigned char *source_base = NULL; uint32_t iov_count; size_t length = pConv->local_size - pConv->bConverted, initial_amount = pConv->bConverted; - ptrdiff_t initial_displ = pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp; - source_base = (pConv->pBaseBuf + initial_displ + pStack[0].disp + pStack[1].disp); + source_base = (pConv->pBaseBuf + pConv->pDesc->true_lb + pStack[0].disp + pStack[1].disp); /* There are some optimizations that can be done if the upper level * does not provide a buffer. @@ -111,155 +108,116 @@ opal_pack_homogeneous_contig_with_gaps_function( opal_convertor_t* pConv, uint32_t* out_size, size_t* max_data ) { + size_t remaining, length, initial_bytes_converted = pConv->bConverted; const opal_datatype_t* pData = pConv->pDesc; dt_stack_t* stack = pConv->pStack; + ptrdiff_t extent = pData->ub - pData->lb; unsigned char *user_memory, *packed_buffer; - uint32_t iov_count, index; + uint32_t idx; size_t i; - size_t bConverted, remaining, length, initial_bytes_converted = pConv->bConverted; - ptrdiff_t extent= pData->ub - pData->lb; - ptrdiff_t initial_displ = pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp; + /* The memory layout is contiguous with gaps in the begining and at the end. The datatype true_lb + * is the initial displacement, the size the length of the contiguous area and the extent represent + * how much we should jump between elements. + */ assert( (pData->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) && ((ptrdiff_t)pData->size != extent) ); DO_DEBUG( opal_output( 0, "pack_homogeneous_contig( pBaseBuf %p, iov_count %d )\n", (void*)pConv->pBaseBuf, *out_size ); ); if( stack[1].type != opal_datatype_uint1.id ) { stack[1].count *= opal_datatype_basicDatatypes[stack[1].type]->size; - stack[1].type = opal_datatype_uint1.id; + stack[1].type = opal_datatype_uint1.id; + } + /* We can provide directly the pointers in the user buffers (like the convertor_raw) */ + if( NULL == iov[0].iov_base ) { + user_memory = pConv->pBaseBuf + pData->true_lb; + + for( idx = 0; (idx < (*out_size)) && stack[0].count; idx++ ) { + iov[idx].iov_base = user_memory + stack[0].disp + stack[1].disp; + iov[idx].iov_len = stack[1].count; + COMPUTE_CSUM( iov[idx].iov_base, iov[idx].iov_len, pConv ); + + pConv->bConverted += stack[1].count; + + stack[0].disp += extent; + stack[0].count--; + stack[1].disp = 0; + stack[1].count = pData->size; /* we might need this to update the partial + * length for the first iteration */ + } + goto update_status_and_return; } - /* There are some optimizations that can be done if the upper level - * does not provide a buffer. - */ - for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { + for( idx = 0; idx < (*out_size); idx++ ) { /* Limit the amount of packed data to the data left over on this convertor */ remaining = pConv->local_size - pConv->bConverted; if( 0 == remaining ) break; /* we're done this time */ - if( remaining > iov[iov_count].iov_len ) - remaining = iov[iov_count].iov_len; - packed_buffer = (unsigned char *)iov[iov_count].iov_base; - bConverted = remaining; /* how much will get unpacked this time */ - user_memory = pConv->pBaseBuf + initial_displ + stack[0].disp + stack[1].disp; - i = pConv->count - stack[0].count; /* how many we already packed */ - assert(i == (pConv->bConverted / pData->size)); - - if( packed_buffer == NULL ) { - /* special case for small data. We avoid allocating memory if we - * can fill the iovec directly with the address of the remaining - * data. - */ - if( stack->count < (size_t)((*out_size) - iov_count) ) { - stack[1].count = pData->size - (pConv->bConverted % pData->size); - for( index = iov_count; i < pConv->count; i++, index++ ) { - iov[index].iov_base = (IOVBASE_TYPE *) user_memory; - iov[index].iov_len = stack[1].count; - stack[0].disp += extent; - pConv->bConverted += stack[1].count; - stack[1].disp = 0; /* reset it for the next round */ - stack[1].count = pData->size; - user_memory = pConv->pBaseBuf + initial_displ + stack[0].disp; - COMPUTE_CSUM( iov[index].iov_base, iov[index].iov_len, pConv ); - } - *out_size = iov_count + index; - *max_data = (pConv->bConverted - initial_bytes_converted); - pConv->flags |= CONVERTOR_COMPLETED; - return 1; /* we're done */ - } - /* now special case for big contiguous data with gaps around */ - if( pData->size >= IOVEC_MEM_LIMIT ) { - /* as we dont have to copy any data, we can simply fill the iovecs - * with data from the user data description. - */ - for( index = iov_count; (i < pConv->count) && (index < (*out_size)); - i++, index++ ) { - if( remaining < pData->size ) { - iov[index].iov_base = (IOVBASE_TYPE *) user_memory; - iov[index].iov_len = remaining; - remaining = 0; - COMPUTE_CSUM( iov[index].iov_base, iov[index].iov_len, pConv ); - break; - } else { - iov[index].iov_base = (IOVBASE_TYPE *) user_memory; - iov[index].iov_len = pData->size; - user_memory += extent; - COMPUTE_CSUM( iov[index].iov_base, (size_t)iov[index].iov_len, pConv ); - } - remaining -= iov[index].iov_len; - pConv->bConverted += iov[index].iov_len; - } - *out_size = index; - *max_data = (pConv->bConverted - initial_bytes_converted); - if( pConv->bConverted == pConv->local_size ) { - pConv->flags |= CONVERTOR_COMPLETED; - return 1; - } - return 0; + if( remaining > iov[idx].iov_len ) + remaining = iov[idx].iov_len; + packed_buffer = (unsigned char *)iov[idx].iov_base; + pConv->bConverted += remaining; + user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp + stack[1].disp; + + DO_DEBUG( opal_output( 0, "pack_homogeneous_contig( user_memory %p, packed_buffer %p length %" PRIsize_t "\n", + (void*)user_memory, (void*)packed_buffer, remaining ); ); + + length = (0 == pConv->stack_pos ? 0 : stack[1].count); /* left over from the last pack */ + /* data left from last round and enough space in the buffer */ + if( (pData->size != length) && (length <= remaining)) { + /* copy the partial left-over from the previous round */ + OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, length, pConv->pBaseBuf, + pData, pConv->count ); + DO_DEBUG( opal_output( 0, "pack dest %p src %p length %" PRIsize_t " [prologue]\n", + (void*)user_memory, (void*)packed_buffer, length ); ); + MEMCPY_CSUM( packed_buffer, user_memory, length, pConv ); + packed_buffer += length; + remaining -= length; + stack[1].count -= length; + stack[1].disp += length; /* just in case, we overwrite this below */ + if( 0 == stack[1].count) { /* one completed element */ + stack[0].count--; + stack[0].disp += extent; + if( 0 == stack[0].count ) /* not yet done */ + break; + stack[1].count = pData->size; + stack[1].disp = 0; } + user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp + stack[1].disp; } - { - DO_DEBUG( opal_output( 0, "pack_homogeneous_contig( user_memory %p, packed_buffer %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); ); - - length = (0 == pConv->stack_pos ? 0 : stack[1].count); /* left over from the last pack */ - /* data left from last round and enough space in the buffer */ - if( (0 != length) && (length <= remaining)) { - /* copy the partial left-over from the previous round */ - OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, length, pConv->pBaseBuf, - pData, pConv->count ); - DO_DEBUG( opal_output( 0, "2. pack dest %p src %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)length ); ); - MEMCPY_CSUM( packed_buffer, user_memory, length, pConv ); - packed_buffer += length; - user_memory += (extent - pData->size + length); - remaining -= length; - stack[1].count -= length; - if( 0 == stack[1].count) { /* one completed element */ - stack[0].count--; - stack[0].disp += extent; - if( 0 != stack[0].count ) { /* not yet done */ - stack[1].count = pData->size; - stack[1].disp = 0; - } - } - } - for( i = 0; pData->size <= remaining; i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, pData->size, pConv->pBaseBuf, - pData, pConv->count ); - DO_DEBUG( opal_output( 0, "3. pack dest %p src %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)pData->size ); ); - MEMCPY_CSUM( packed_buffer, user_memory, pData->size, pConv ); - packed_buffer += pData->size; - user_memory += extent; - remaining -= pData->size; - } - stack[0].count -= i; /* the filled up and the entire types */ - stack[0].disp += (i * extent); - stack[1].disp += remaining; - /* Copy the last bits */ - if( 0 != remaining ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, pConv->pBaseBuf, - pData, pConv->count ); - DO_DEBUG( opal_output( 0, "4. pack dest %p src %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); ); - MEMCPY_CSUM( packed_buffer, user_memory, remaining, pConv ); - user_memory += remaining; - stack[1].count -= remaining; - } + for( i = 0; pData->size <= remaining; i++ ) { + OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, pData->size, pConv->pBaseBuf, + pData, pConv->count ); + DO_DEBUG( opal_output( 0, "pack dest %p src %p length %" PRIsize_t " [%" PRIsize_t "/%" PRIsize_t "\n", + (void*)user_memory, (void*)packed_buffer, pData->size, remaining, iov[idx].iov_len ); ); + MEMCPY_CSUM( packed_buffer, user_memory, pData->size, pConv ); + packed_buffer += pData->size; + user_memory += extent; + remaining -= pData->size; + } + stack[0].count -= i; /* the entire datatype copied above */ + stack[0].disp += (i * extent); + + /* Copy the last bits */ + if( 0 != remaining ) { + OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, pConv->pBaseBuf, + pData, pConv->count ); + DO_DEBUG( opal_output( 0, "4. pack dest %p src %p length %" PRIsize_t "\n", + (void*)user_memory, (void*)packed_buffer, remaining ); ); + MEMCPY_CSUM( packed_buffer, user_memory, remaining, pConv ); + stack[1].count -= remaining; + stack[1].disp += remaining; /* keep the += in case we are copying less that the datatype size */ if( 0 == stack[1].count ) { /* prepare for the next element */ stack[1].count = pData->size; stack[1].disp = 0; } } - pConv->bConverted += bConverted; - } - *out_size = iov_count; - *max_data = (pConv->bConverted - initial_bytes_converted); - if( pConv->bConverted == pConv->local_size ) { - pConv->flags |= CONVERTOR_COMPLETED; - return 1; } - return 0; + + update_status_and_return: + *out_size = idx; + *max_data = pConv->bConverted - initial_bytes_converted; + if( pConv->bConverted == pConv->local_size ) pConv->flags |= CONVERTOR_COMPLETED; + return !!(pConv->flags & CONVERTOR_COMPLETED); /* done or not */ } /* The pack/unpack functions need a cleanup. I have to create a proper interface to access @@ -314,18 +272,32 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { iov_ptr = (unsigned char *) iov[iov_count].iov_base; iov_len_local = iov[iov_count].iov_len; - while( 1 ) { - while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { - /* now here we have a basic datatype */ - PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, - conv_ptr, iov_ptr, iov_len_local ); - if( 0 == count_desc ) { /* completed */ + + if( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + if( (pElem->elem.count * pElem->elem.blocklen) != count_desc ) { + /* we have a partial (less than blocklen) basic datatype */ + int rc = PACK_PARTIAL_BLOCKLEN( pConvertor, pElem, count_desc, + conv_ptr, iov_ptr, iov_len_local ); + if( 0 == rc ) /* not done */ + goto complete_loop; + if( 0 == count_desc ) { conv_ptr = pConvertor->pBaseBuf + pStack->disp; pos_desc++; /* advance to the next data */ UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - continue; } - goto complete_loop; + } + } + + while( 1 ) { + while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + /* we have a basic datatype (working on full blocks) */ + PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, + conv_ptr, iov_ptr, iov_len_local ); + if( 0 != count_desc ) /* completed? */ + goto complete_loop; + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + pos_desc++; /* advance to the next data */ + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); } if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ DO_DEBUG( opal_output( 0, "pack end_loop count %" PRIsize_t " stack_pos %d" diff --git a/opal/datatype/opal_datatype_pack.h b/opal/datatype/opal_datatype_pack.h index f952cabc3c0..1eaf2e8b9f9 100644 --- a/opal/datatype/opal_datatype_pack.h +++ b/opal/datatype/opal_datatype_pack.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ /* - * Copyright (c) 2004-2009 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. @@ -19,8 +19,6 @@ #include "opal_config.h" -#include - #if !defined(CHECKSUM) && OPAL_CUDA_SUPPORT /* Make use of existing macro to do CUDA style memcpy */ #undef MEMCPY_CSUM @@ -28,88 +26,181 @@ CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) ) #endif -static inline void pack_predefined_data( opal_convertor_t* CONVERTOR, - const dt_elem_desc_t* ELEM, - size_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, - size_t* SPACE ) +/** + * This function deals only with partial elements. The COUNT points however to the whole leftover count, + * but this function is only expected to operate on an amount less than blength, that would allow the rest + * of the pack process to handle only entire blength blocks (plus the left over). + * + * Return 1 if we are now aligned on a block, 0 otherwise. + */ +static inline int +pack_partial_blocklen( opal_convertor_t* CONVERTOR, + const dt_elem_desc_t* ELEM, + size_t* COUNT, + unsigned char** memory, + unsigned char** packed, + size_t* SPACE ) { - size_t _copy_count = *(COUNT); - size_t _copy_blength; const ddt_elem_desc_t* _elem = &((ELEM)->elem); - unsigned char* _source = (*SOURCE) + _elem->disp; + size_t do_now_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size; + size_t do_now = *(COUNT); + unsigned char* _memory = (*memory) + _elem->disp; + unsigned char* _packed = *packed; + + assert( *(COUNT) <= _elem->count * _elem->blocklen); + + /** + * First check if we already did something on this element ? The COUNT is the number + * of remaining predefined types in the current elem, not how many predefined types + * should be manipulated in the current call (this number is instead reflected on the + * SPACE). + */ + if( 0 == (do_now = (*COUNT) % _elem->blocklen) ) + return 1; - _copy_blength = opal_datatype_basicDatatypes[_elem->common.type]->size; - if( (_copy_count * _copy_blength) > *(SPACE) ) { - _copy_count = (*(SPACE) / _copy_blength); - if( 0 == _copy_count ) return; /* nothing to do */ + size_t left_in_block = do_now; /* left in the current blocklen */ + + if( (do_now_bytes * do_now) > *(SPACE) ) + do_now = (*SPACE) / do_now_bytes; + + do_now_bytes *= do_now; + + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "pack memcpy( %p, %p, %lu ) => space %lu [partial]\n", + _packed, (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); + MEMCPY_CSUM( _packed, _memory, do_now_bytes, (CONVERTOR) ); + *(memory) += (ptrdiff_t)do_now_bytes; + if( do_now == left_in_block ) /* compensate if completed a blocklen */ + *(memory) += _elem->extent - (_elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size); + + *(COUNT) -= do_now; + *(SPACE) -= do_now_bytes; + *(packed) += do_now_bytes; + return (do_now == left_in_block); +} + +/** + * Pack entire blocks, plus a possible remainder if SPACE is constrained to less than COUNT elements. + */ +static inline void +pack_predefined_data( opal_convertor_t* CONVERTOR, + const dt_elem_desc_t* ELEM, + size_t* COUNT, + unsigned char** memory, + unsigned char** packed, + size_t* SPACE ) +{ + const ddt_elem_desc_t* _elem = &((ELEM)->elem); + size_t blocklen_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size; + size_t cando_count = *(COUNT), do_now_bytes; + unsigned char* _memory = (*memory) + _elem->disp; + unsigned char* _packed = *packed; + + assert( 0 == (cando_count % _elem->blocklen) ); /* no partials here */ + assert( *(COUNT) <= _elem->count * _elem->blocklen); + + if( (blocklen_bytes * cando_count) > *(SPACE) ) + cando_count = (*SPACE) / blocklen_bytes; + + /* premptively update the number of COUNT we will return. */ + *(COUNT) -= cando_count; + + if( 1 == _elem->blocklen ) { /* Do as many full blocklen as possible */ + for(; cando_count > 0; cando_count--) { + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "pack memcpy( %p, %p, %lu ) => space %lu [blen = 1]\n", + (void*)_packed, (void*)_memory, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); + MEMCPY_CSUM( _packed, _memory, blocklen_bytes, (CONVERTOR) ); + _packed += blocklen_bytes; + _memory += _elem->extent; + } + goto update_and_return; } - if( (ptrdiff_t)_copy_blength == _elem->extent ) { - _copy_blength *= _copy_count; - /* the extent and the size of the basic datatype are equal */ - OPAL_DATATYPE_SAFEGUARD_POINTER( _source, _copy_blength, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "pack 1. memcpy( %p, %p, %lu ) => space %lu\n", - (void*)*(DESTINATION), (void*)_source, (unsigned long)_copy_blength, (unsigned long)(*(SPACE)) ); ); - MEMCPY_CSUM( *(DESTINATION), _source, _copy_blength, (CONVERTOR) ); - _source += _copy_blength; - *(DESTINATION) += _copy_blength; - } else { - for(size_t _i = 0; _i < _copy_count; _i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( _source, _copy_blength, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); + if( (1 < _elem->count) && (_elem->blocklen <= cando_count) ) { + blocklen_bytes *= _elem->blocklen; + + do { /* Do as many full blocklen as possible */ + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n", - (void*)*(DESTINATION), (void*)_source, (unsigned long)_copy_blength, (unsigned long)(*(SPACE) - (_i * _copy_blength)) ); ); - MEMCPY_CSUM( *(DESTINATION), _source, _copy_blength, (CONVERTOR) ); - *(DESTINATION) += _copy_blength; - _source += _elem->extent; - } - _copy_blength *= _copy_count; + (void*)_packed, (void*)_memory, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); + MEMCPY_CSUM( _packed, _memory, blocklen_bytes, (CONVERTOR) ); + _packed += blocklen_bytes; + _memory += _elem->extent; + cando_count -= _elem->blocklen; + } while (_elem->blocklen <= cando_count); } - *(SOURCE) = _source - _elem->disp; - *(SPACE) -= _copy_blength; - *(COUNT) -= _copy_count; + + /** + * As an epilog do anything left from the last blocklen. + */ + if( 0 != cando_count ) { + assert( (cando_count < _elem->blocklen) || + ((1 == _elem->count) && (cando_count <= _elem->blocklen)) ); + do_now_bytes = cando_count * opal_datatype_basicDatatypes[_elem->common.type]->size; + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "pack 3. memcpy( %p, %p, %lu ) => space %lu [epilog]\n", + (void*)_packed, (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); + MEMCPY_CSUM( _packed, _memory, do_now_bytes, (CONVERTOR) ); + _memory += do_now_bytes; + _packed += do_now_bytes; + } + + update_and_return: + *(memory) = _memory - _elem->disp; + *(SPACE) -= (_packed - *packed); + *(packed) = _packed; } static inline void pack_contiguous_loop( opal_convertor_t* CONVERTOR, const dt_elem_desc_t* ELEM, size_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, + unsigned char** memory, + unsigned char** packed, size_t* SPACE ) { const ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); const ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); - unsigned char* _source = (*SOURCE) + _end_loop->first_elem_disp; + unsigned char* _memory = (*memory) + _end_loop->first_elem_disp; size_t _copy_loops = *(COUNT); if( (_copy_loops * _end_loop->size) > *(SPACE) ) _copy_loops = (*(SPACE) / _end_loop->size); for(size_t _i = 0; _i < _copy_loops; _i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( _source, _end_loop->size, (CONVERTOR)->pBaseBuf, + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, _end_loop->size, (CONVERTOR)->pBaseBuf, (CONVERTOR)->pDesc, (CONVERTOR)->count ); DO_DEBUG( opal_output( 0, "pack 3. memcpy( %p, %p, %lu ) => space %lu\n", - (void*)*(DESTINATION), (void*)_source, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i * _end_loop->size) ); ); - MEMCPY_CSUM( *(DESTINATION), _source, _end_loop->size, (CONVERTOR) ); - *(DESTINATION) += _end_loop->size; - _source += _loop->extent; + (void*)*(packed), (void*)_memory, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i * _end_loop->size) ); ); + MEMCPY_CSUM( *(packed), _memory, _end_loop->size, (CONVERTOR) ); + *(packed) += _end_loop->size; + _memory += _loop->extent; } - *(SOURCE) = _source - _end_loop->first_elem_disp; + *(memory) = _memory - _end_loop->first_elem_disp; *(SPACE) -= _copy_loops * _end_loop->size; *(COUNT) -= _copy_loops; } -#define PACK_PREDEFINED_DATATYPE( CONVERTOR, /* the convertor */ \ +#define PACK_PARTIAL_BLOCKLEN( CONVERTOR, /* the convertor */ \ + ELEM, /* the basic element to be packed */ \ + COUNT, /* the number of elements */ \ + MEMORY, /* the source pointer (char*) */ \ + PACKED, /* the destination pointer (char*) */ \ + SPACE ) /* the space in the destination buffer */ \ +pack_partial_blocklen( (CONVERTOR), (ELEM), &(COUNT), &(MEMORY), &(PACKED), &(SPACE) ) + +#define PACK_PREDEFINED_DATATYPE( CONVERTOR, /* the convertor */ \ ELEM, /* the basic element to be packed */ \ COUNT, /* the number of elements */ \ - SOURCE, /* the source pointer (char*) */ \ - DESTINATION, /* the destination pointer (char*) */ \ + MEMORY, /* the source pointer (char*) */ \ + PACKED, /* the destination pointer (char*) */ \ SPACE ) /* the space in the destination buffer */ \ -pack_predefined_data( (CONVERTOR), (ELEM), &(COUNT), &(SOURCE), &(DESTINATION), &(SPACE) ) +pack_predefined_data( (CONVERTOR), (ELEM), &(COUNT), &(MEMORY), &(PACKED), &(SPACE) ) -#define PACK_CONTIGUOUS_LOOP( CONVERTOR, ELEM, COUNT, SOURCE, DESTINATION, SPACE ) \ - pack_contiguous_loop( (CONVERTOR), (ELEM), &(COUNT), &(SOURCE), &(DESTINATION), &(SPACE) ) +#define PACK_CONTIGUOUS_LOOP( CONVERTOR, ELEM, COUNT, MEMORY, PACKED, SPACE ) \ + pack_contiguous_loop( (CONVERTOR), (ELEM), &(COUNT), &(MEMORY), &(PACKED), &(SPACE) ) #endif /* OPAL_DATATYPE_PACK_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/opal_datatype_position.c b/opal/datatype/opal_datatype_position.c index 3b8eaec69c6..02ec55651a0 100644 --- a/opal/datatype/opal_datatype_position.c +++ b/opal/datatype/opal_datatype_position.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2014 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -33,7 +33,7 @@ #if OPAL_ENABLE_DEBUG #include "opal/util/output.h" -#define DO_DEBUG(INST) if( opal_position_debug ) { INST } +#define DO_DEBUG(INST) if( opal_ddt_position_debug ) { INST } #else #define DO_DEBUG(INST) #endif /* OPAL_ENABLE_DEBUG */ @@ -49,10 +49,24 @@ * - the DT_CONTIGUOUS flag for the type OPAL_DATATYPE_END_LOOP is meaningless. */ +static inline void +position_single_block(opal_convertor_t* CONVERTOR, + unsigned char** mem, ptrdiff_t mem_update, + size_t* space, size_t space_update, + size_t* cnt, size_t cnt_update) +{ + OPAL_DATATYPE_SAFEGUARD_POINTER( *mem, mem_update, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "position( %p, %lu ) => space %lu [prolog]\n", + (void*)*mem, (unsigned long)space_update, (unsigned long)(*space) ); ); + *mem += mem_update; + *space -= space_update; + *cnt -= cnt_update; +} + /** - * Advance the current position in the convertor based using the - * current element and a left-over counter. Update the head pointer - * and the leftover byte space. + * Advance the convertors' position according. Update the pointer and the remaining space + * accordingly. */ static inline void position_predefined_data( opal_convertor_t* CONVERTOR, @@ -61,55 +75,81 @@ position_predefined_data( opal_convertor_t* CONVERTOR, unsigned char** POINTER, size_t* SPACE ) { - size_t _copy_count = *(COUNT); - size_t _copy_blength; - ddt_elem_desc_t* _elem = &((ELEM)->elem); + const ddt_elem_desc_t* _elem = &((ELEM)->elem); + size_t total_count = _elem->count * _elem->blocklen; + size_t cando_count = (*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size; + size_t do_now, do_now_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size; + unsigned char* _memory = (*POINTER) + _elem->disp; + + assert( *(COUNT) <= _elem->count * _elem->blocklen); - _copy_blength = opal_datatype_basicDatatypes[_elem->common.type]->size; - if( (_copy_count * _copy_blength) > *(SPACE) ) { - _copy_count = *(SPACE) / _copy_blength; - if( 0 == _copy_count ) return; /* nothing to do */ + if( cando_count > *(COUNT) ) + cando_count = *(COUNT); + + if( 1 == _elem->blocklen ) { + DO_DEBUG( opal_output( 0, "position( %p, %" PRIsize_t " ) x (count %" PRIsize_t ", extent %ld) => space %lu [prolog]\n", + (void*)_memory, (unsigned long)do_now_bytes, cando_count, _elem->extent, (unsigned long)(*SPACE) ); ); + _memory += cando_count * _elem->extent; + *SPACE -= cando_count * do_now_bytes; + *COUNT -= cando_count; + goto update_and_return; } - _copy_blength *= _copy_count; - OPAL_DATATYPE_SAFEGUARD_POINTER( *(POINTER) + _elem->disp, _copy_blength, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - *(POINTER) += (_copy_count * _elem->extent); - *(SPACE) -= _copy_blength; - *(COUNT) -= _copy_count; -} + /** + * First check if we already did something on this element ? + */ + do_now = (total_count - *(COUNT)); /* done elements */ + if( 0 != do_now ) { + do_now = do_now % _elem->blocklen; /* partial blocklen? */ -/** - * Advance the current position in the convertor based using the - * current contiguous loop and a left-over counter. Update the head - * pointer and the leftover byte space. - */ -static inline void -position_contiguous_loop( opal_convertor_t* CONVERTOR, - dt_elem_desc_t* ELEM, - size_t* COUNT, - unsigned char** POINTER, - size_t* SPACE ) -{ - ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); - ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + (ELEM)->loop.items); - size_t _copy_loops = *(COUNT); + if( 0 != do_now ) { + size_t left_in_block = _elem->blocklen - do_now; /* left in the current blocklen */ + do_now = (left_in_block > cando_count ) ? cando_count : left_in_block; + do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; - if( (_copy_loops * _end_loop->size) > *(SPACE) ) - _copy_loops = *(SPACE) / _end_loop->size; - OPAL_DATATYPE_SAFEGUARD_POINTER( *(POINTER) + _end_loop->first_elem_disp, - (_copy_loops - 1) * _loop->extent + _end_loop->size, - (CONVERTOR)->pBaseBuf, (CONVERTOR)->pDesc, (CONVERTOR)->count ); - *(POINTER) += _copy_loops * _loop->extent; - *(SPACE) -= _copy_loops * _end_loop->size; - *(COUNT) -= _copy_loops; -} + position_single_block( CONVERTOR, &_memory, do_now_bytes, + SPACE, do_now_bytes, COUNT, do_now ); -#define POSITION_PREDEFINED_DATATYPE( CONVERTOR, ELEM, COUNT, POSITION, SPACE ) \ - position_predefined_data( (CONVERTOR), (ELEM), &(COUNT), &(POSITION), &(SPACE) ) + /* compensate if we just completed a blocklen */ + if( do_now == left_in_block ) + _memory += _elem->extent - (_elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size); + cando_count -= do_now; + } + } + + /** + * Compute how many full blocklen we need to do and do them. + */ + do_now = cando_count / _elem->blocklen; + if( 0 != do_now ) { + do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size; +#if OPAL_ENABLE_DEBUG + for(size_t _i = 0; _i < do_now; _i++ ) { + position_single_block( CONVERTOR, &_memory, _elem->extent, + SPACE, do_now_bytes, COUNT, _elem->blocklen ); + cando_count -= _elem->blocklen; + } +#else + _memory += do_now * _elem->extent; + *SPACE -= do_now * do_now_bytes; + *COUNT -= do_now * _elem->blocklen; + cando_count -= do_now * _elem->blocklen; +#endif /* OPAL_ENABLE_DEBUG */ + } + + /** + * As an epilog do anything left from the last blocklen. + */ + do_now = cando_count; + if( 0 != do_now ) { + do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; + position_single_block( CONVERTOR, &_memory, do_now_bytes, + SPACE, do_now_bytes, COUNT, do_now ); + } -#define POSITION_CONTIGUOUS_LOOP( CONVERTOR, ELEM, COUNT, POSITION, SPACE ) \ - position_contiguous_loop( (CONVERTOR), (ELEM), &(COUNT), &(POSITION), &(SPACE) ) + update_and_return: + *(POINTER) = _memory - _elem->disp; +} int opal_convertor_generic_simple_position( opal_convertor_t* pConvertor, size_t* position ) @@ -117,10 +157,10 @@ int opal_convertor_generic_simple_position( opal_convertor_t* pConvertor, dt_stack_t* pStack; /* pointer to the position on the stack */ uint32_t pos_desc; /* actual position in the description of the derived datatype */ size_t count_desc; /* the number of items already done in the actual pos_desc */ + size_t iov_len_local; dt_elem_desc_t* description = pConvertor->use_desc->desc; dt_elem_desc_t* pElem; /* current position */ unsigned char *base_pointer = pConvertor->pBaseBuf; - size_t iov_len_local; ptrdiff_t extent = pConvertor->pDesc->ub - pConvertor->pDesc->lb; DUMP( "opal_convertor_generic_simple_position( %p, &%ld )\n", (void*)pConvertor, (long)*position ); @@ -128,8 +168,8 @@ int opal_convertor_generic_simple_position( opal_convertor_t* pConvertor, /* We dont want to have to parse the datatype multiple times. What we are interested in * here is to compute the number of completed datatypes that we can move forward, update - * the counters and finally compute the position taking in account only the remaining - * elements. The only problem is that we have to modify all the elements on the stack. + * the counters and compute the position taking in account only the remaining elements. + * The only problem is that we have to modify all the elements on the stack. */ iov_len_local = *position - pConvertor->bConverted; if( iov_len_local > pConvertor->pDesc->size ) { @@ -171,21 +211,19 @@ int opal_convertor_generic_simple_position( opal_convertor_t* pConvertor, assert(pConvertor->partial_length < element_length); return 0; } - pConvertor->partial_length = (pConvertor->partial_length + missing_length) % element_length; - assert(pConvertor->partial_length == 0); + pConvertor->partial_length = 0; pConvertor->bConverted += missing_length; iov_len_local -= missing_length; count_desc--; } while( 1 ) { - if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ + if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the the entire datatype */ DO_DEBUG( opal_output( 0, "position end_loop count %" PRIsize_t " stack_pos %d pos_desc %d disp %lx space %lu\n", pStack->count, pConvertor->stack_pos, pos_desc, pStack->disp, (unsigned long)iov_len_local ); ); if( --(pStack->count) == 0 ) { /* end of loop */ if( pConvertor->stack_pos == 0 ) { pConvertor->flags |= CONVERTOR_COMPLETED; - pConvertor->partial_length = 0; goto complete_loop; /* completed */ } pConvertor->stack_pos--; @@ -194,11 +232,13 @@ int opal_convertor_generic_simple_position( opal_convertor_t* pConvertor, } else { if( pStack->index == -1 ) { pStack->disp += extent; + pos_desc = 0; /* back to the first element */ } else { assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); pStack->disp += description[pStack->index].loop.extent; + pos_desc = pStack->index; /* go back to the loop start itself to give a chance + * to move forward by entire loops */ } - pos_desc = pStack->index + 1; } base_pointer = pConvertor->pBaseBuf + pStack->disp; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); @@ -208,9 +248,14 @@ int opal_convertor_generic_simple_position( opal_convertor_t* pConvertor, } if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { ptrdiff_t local_disp = (ptrdiff_t)base_pointer; - if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - POSITION_CONTIGUOUS_LOOP( pConvertor, pElem, count_desc, - base_pointer, iov_len_local ); + ddt_endloop_desc_t* end_loop = (ddt_endloop_desc_t*)(pElem + pElem->loop.items); + size_t full_loops = iov_len_local / end_loop->size; + full_loops = count_desc <= full_loops ? count_desc : full_loops; + if( full_loops ) { + base_pointer += full_loops * pElem->loop.extent; + iov_len_local -= full_loops * end_loop->size; + count_desc -= full_loops; + if( 0 == count_desc ) { /* completed */ pos_desc += pElem->loop.items + 1; goto update_loop_description; @@ -232,8 +277,7 @@ int opal_convertor_generic_simple_position( opal_convertor_t* pConvertor, } while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ - POSITION_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, - base_pointer, iov_len_local ); + position_predefined_data( pConvertor, pElem, &count_desc, &base_pointer, &iov_len_local ); if( 0 != count_desc ) { /* completed */ pConvertor->partial_length = iov_len_local; goto complete_loop; diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index 3edb9161923..0925bde736d 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -33,7 +33,7 @@ #if OPAL_ENABLE_DEBUG #include "opal/util/output.h" -#define DO_DEBUG(INST) if( opal_unpack_debug ) { INST } +#define DO_DEBUG(INST) if( opal_ddt_unpack_debug ) { INST } #else #define DO_DEBUG(INST) #endif /* OPAL_ENABLE_DEBUG */ @@ -70,98 +70,82 @@ opal_unpack_homogeneous_contig_function( opal_convertor_t* pConv, { const opal_datatype_t *pData = pConv->pDesc; unsigned char *user_memory, *packed_buffer; - uint32_t iov_count, i; - size_t bConverted, remaining, length, initial_bytes_converted = pConv->bConverted; + uint32_t iov_idx, i; + size_t remaining, initial_bytes_converted = pConv->bConverted; dt_stack_t* stack = pConv->pStack; ptrdiff_t extent = pData->ub - pData->lb; - ptrdiff_t initial_displ = pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp; - DO_DEBUG( opal_output( 0, "unpack_homogeneous_contig( pBaseBuf %p, iov_count %d )\n", + DO_DEBUG( opal_output( 0, "unpack_homogeneous_contig( pBaseBuf %p, iov count %d )\n", (void*)pConv->pBaseBuf, *out_size ); ); if( stack[1].type != opal_datatype_uint1.id ) { stack[1].count *= opal_datatype_basicDatatypes[stack[1].type]->size; stack[1].type = opal_datatype_uint1.id; } - for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { - remaining = pConv->local_size - pConv->bConverted; - if( 0 == remaining ) break; /* we're done this time */ - if( remaining > iov[iov_count].iov_len ) - remaining = iov[iov_count].iov_len; - packed_buffer = (unsigned char*)iov[iov_count].iov_base; - bConverted = remaining; /* how much will get unpacked this time */ - user_memory = pConv->pBaseBuf + initial_displ; - - if( (ptrdiff_t)pData->size == extent ) { - user_memory += pConv->bConverted; - DO_DEBUG( opal_output( 0, "unpack_homogeneous_contig( user_memory %p, packed_buffer %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); ); + + if( (ptrdiff_t)pData->size == extent ) { + for( iov_idx = 0; iov_idx < (*out_size); iov_idx++ ) { + remaining = pConv->local_size - pConv->bConverted; + if( 0 == remaining ) break; /* we're done this time */ + if( remaining > iov[iov_idx].iov_len ) + remaining = iov[iov_idx].iov_len; + + packed_buffer = (unsigned char*)iov[iov_idx].iov_base; + user_memory = pConv->pBaseBuf + pData->true_lb + pConv->bConverted; /* contiguous data or basic datatype with count */ OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, pConv->pBaseBuf, pData, pConv->count ); - DO_DEBUG( opal_output( 0, "1. unpack contig dest %p src %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); ); + DO_DEBUG( opal_output( 0, "unpack contig [%d] dest %p src %p length %" PRIsize_t "\n", + iov_idx, (void*)user_memory, (void*)packed_buffer, remaining ); ); MEMCPY_CSUM( user_memory, packed_buffer, remaining, pConv ); - } else { - user_memory += stack[0].disp + stack[1].disp; + pConv->bConverted += remaining; /* how much will get unpacked this time */ + } + } else { + for( iov_idx = 0; iov_idx < (*out_size); iov_idx++ ) { + remaining = pConv->local_size - pConv->bConverted; + if( 0 == remaining ) break; /* we're done this time */ + if( remaining > iov[iov_idx].iov_len ) + remaining = iov[iov_idx].iov_len; + + packed_buffer = (unsigned char*)iov[iov_idx].iov_base; + user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp + stack[1].disp; + pConv->bConverted += remaining; /* how much will get unpacked this time */ + + for( i = 0; stack[1].count <= remaining; i++ ) { /* partial or full data */ + OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, stack[1].count, pConv->pBaseBuf, + pData, pConv->count ); + DO_DEBUG( opal_output( 0, "unpack gaps [%d] dest %p src %p length %" PRIsize_t " [%d]\n", + iov_idx, (void*)user_memory, (void*)packed_buffer, stack[1].count, i ); ); + MEMCPY_CSUM( user_memory, packed_buffer, stack[1].count, pConv ); - DO_DEBUG( opal_output( 0, "unpack_homogeneous_contig( user_memory %p, packed_buffer %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); ); + packed_buffer += stack[1].count; + remaining -= stack[1].count; - length = (0 == pConv->stack_pos ? 0 : stack[1].count); /* left over from the last unpack */ - /* complete the last copy */ - if( (0 != length) && (length <= remaining) ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, length, pConv->pBaseBuf, - pData, pConv->count ); - DO_DEBUG( opal_output( 0, "2. unpack dest %p src %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)length ); ); - MEMCPY_CSUM( user_memory, packed_buffer, length, pConv ); - packed_buffer += length; - user_memory += (extent - (pData->size - length)); - remaining -= length; - stack[1].count -= length; - if( 0 == stack[1].count) { /* one completed element */ - stack[0].count--; - stack[0].disp += extent; - if( 0 != stack[0].count ) { /* not yet done */ - stack[1].count = pData->size; - stack[1].disp = 0; - } - } - } - for( i = 0; pData->size <= remaining; i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, pData->size, pConv->pBaseBuf, - pData, pConv->count ); - DO_DEBUG( opal_output( 0, "3. unpack dest %p src %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)pData->size ); ); - MEMCPY_CSUM( user_memory, packed_buffer, pData->size, pConv ); - packed_buffer += pData->size; - user_memory += extent; - remaining -= pData->size; + stack[0].count--; + stack[0].disp += extent; + stack[1].count = pData->size; + stack[1].disp = 0; + + user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp; } - stack[0].count -= i; - stack[0].disp += (i * extent); - stack[1].disp += remaining; - /* copy the last bits */ + + /* Copy the last bits */ if( 0 != remaining ) { OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, pConv->pBaseBuf, pData, pConv->count ); - DO_DEBUG( opal_output( 0, "4. unpack dest %p src %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); ); + DO_DEBUG( opal_output( 0, "unpack gaps [%d] dest %p src %p length %" PRIsize_t " [epilog]\n", + iov_idx, (void*)user_memory, (void*)packed_buffer, remaining ); ); MEMCPY_CSUM( user_memory, packed_buffer, remaining, pConv ); - user_memory += remaining; stack[1].count -= remaining; + stack[1].disp += remaining; /* keep the += in case we are copying less that the datatype size */ + assert( stack[1].count ); } } - pConv->bConverted += bConverted; - } - *out_size = iov_count; /* we only reach this line after the for loop succesfully complete */ - *max_data = (pConv->bConverted - initial_bytes_converted); - if( pConv->bConverted == pConv->local_size ) { - pConv->flags |= CONVERTOR_COMPLETED; - return 1; } - return 0; + *out_size = iov_idx; /* we only reach this line after the for loop succesfully complete */ + *max_data = pConv->bConverted - initial_bytes_converted; + if( pConv->bConverted == pConv->local_size ) pConv->flags |= CONVERTOR_COMPLETED; + return !!(pConv->flags & CONVERTOR_COMPLETED); /* done or not */ } /** @@ -179,7 +163,7 @@ opal_unpack_homogeneous_contig_function( opal_convertor_t* pConv, static inline void opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pElem, unsigned char* partial_data, - ptrdiff_t start_position, ptrdiff_t length, + ptrdiff_t start_position, size_t length, unsigned char** user_buffer ) { char unused_byte = 0x7F, saved_data[16]; @@ -195,7 +179,7 @@ opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pEle /* Find a byte that is not used in the partial buffer */ find_unused_byte: - for(ptrdiff_t i = 0; i < length; i++ ) { + for(size_t i = 0; i < length; i++ ) { if( unused_byte == partial_data[i] ) { unused_byte--; goto find_unused_byte; @@ -298,6 +282,7 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { iov_ptr = (unsigned char *) iov[iov_count].iov_base; iov_len_local = iov[iov_count].iov_len; + if( 0 != pConvertor->partial_length ) { size_t element_length = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; size_t missing_length = element_length - pConvertor->partial_length; @@ -306,7 +291,7 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, COMPUTE_CSUM( iov_ptr, missing_length, pConvertor ); opal_unpack_partial_datatype( pConvertor, pElem, iov_ptr, - pConvertor->partial_length, element_length - pConvertor->partial_length, + pConvertor->partial_length, (size_t)(element_length - pConvertor->partial_length), &conv_ptr ); --count_desc; if( 0 == count_desc ) { @@ -318,34 +303,31 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, iov_len_local -= missing_length; pConvertor->partial_length = 0; /* nothing more inside */ } - while( 1 ) { - while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { - /* now here we have a basic datatype */ - UNPACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, - iov_ptr, conv_ptr, iov_len_local ); - if( 0 == count_desc ) { /* completed */ + if( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + if( (pElem->elem.count * pElem->elem.blocklen) != count_desc ) { + /* we have a partial (less than blocklen) basic datatype */ + int rc = UNPACK_PARTIAL_BLOCKLEN( pConvertor, pElem, count_desc, + iov_ptr, conv_ptr, iov_len_local ); + if( 0 == rc ) /* not done */ + goto complete_loop; + if( 0 == count_desc ) { conv_ptr = pConvertor->pBaseBuf + pStack->disp; pos_desc++; /* advance to the next data */ UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - continue; } - assert( pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED ); - if( 0 != iov_len_local ) { - unsigned char* temp = conv_ptr; - /* We have some partial data here. Let's copy it into the convertor - * and keep it hot until the next round. - */ - assert( iov_len_local < opal_datatype_basicDatatypes[pElem->elem.common.type]->size ); - COMPUTE_CSUM( iov_ptr, iov_len_local, pConvertor ); - - opal_unpack_partial_datatype( pConvertor, pElem, - iov_ptr, 0, iov_len_local, - &temp ); - - pConvertor->partial_length = iov_len_local; - iov_len_local = 0; - } - goto complete_loop; + } + } + + while( 1 ) { + while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + /* we have a basic datatype (working on full blocks) */ + UNPACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, + iov_ptr, conv_ptr, iov_len_local ); + if( 0 != count_desc ) /* completed? */ + goto complete_loop; + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + pos_desc++; /* advance to the next data */ + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); } if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ DO_DEBUG( opal_output( 0, "unpack end_loop count %" PRIsize_t " stack_pos %d pos_desc %d disp %ld space %lu\n", @@ -353,11 +335,9 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, pStack->disp, (unsigned long)iov_len_local ); ); if( --(pStack->count) == 0 ) { /* end of loop */ if( 0 == pConvertor->stack_pos ) { - /* Do the same thing as when the loop is completed */ - iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ - total_unpacked += iov[iov_count].iov_len; - iov_count++; /* go to the next */ - goto complete_conversion; + /* we're done. Force the exit of the main for loop (around iovec) */ + *out_size = iov_count; + goto complete_loop; } pConvertor->stack_pos--; pStack--; @@ -396,14 +376,29 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, conv_ptr = pConvertor->pBaseBuf + pStack->disp; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" ); - continue; } } complete_loop: + assert( pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED ); + if( 0 != iov_len_local ) { + unsigned char* temp = conv_ptr; + /* We have some partial data here. Let's copy it into the convertor + * and keep it hot until the next round. + */ + assert( iov_len_local < opal_datatype_basicDatatypes[pElem->elem.common.type]->size ); + COMPUTE_CSUM( iov_ptr, iov_len_local, pConvertor ); + + opal_unpack_partial_datatype( pConvertor, pElem, + iov_ptr, 0, iov_len_local, + &temp ); + + pConvertor->partial_length = iov_len_local; + iov_len_local = 0; + } + iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ total_unpacked += iov[iov_count].iov_len; } - complete_conversion: *max_data = total_unpacked; pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ *out_size = iov_count; @@ -530,11 +525,9 @@ opal_unpack_general_function( opal_convertor_t* pConvertor, pStack->disp, (unsigned long)iov_len_local ); ); if( --(pStack->count) == 0 ) { /* end of loop */ if( 0 == pConvertor->stack_pos ) { - /* Do the same thing as when the loop is completed */ - iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ - total_unpacked += iov[iov_count].iov_len; - iov_count++; /* go to the next */ - goto complete_conversion; + /* we're done. Force the exit of the main for loop (around iovec) */ + *out_size = iov_count; + goto complete_loop; } pConvertor->stack_pos--; pStack--; @@ -568,7 +561,6 @@ opal_unpack_general_function( opal_convertor_t* pConvertor, iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ total_unpacked += iov[iov_count].iov_len; } - complete_conversion: *max_data = total_unpacked; pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ *out_size = iov_count; diff --git a/opal/datatype/opal_datatype_unpack.h b/opal/datatype/opal_datatype_unpack.h index d837aad5ab7..db5b58fd3c3 100644 --- a/opal/datatype/opal_datatype_unpack.h +++ b/opal/datatype/opal_datatype_unpack.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ /* - * Copyright (c) 2004-2009 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. @@ -26,84 +26,178 @@ CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) ) #endif +/** + * This function deals only with partial elements. The COUNT points however to the whole leftover count, + * but this function is only expected to operate on an amount less than blength, that would allow the rest + * of the pack process to handle only entire blength blocks (plus the left over). + * + * Return 1 if we are now aligned on a block, 0 otherwise. + */ +static inline int +unpack_partial_blocklen( opal_convertor_t* CONVERTOR, + const dt_elem_desc_t* ELEM, + size_t* COUNT, + unsigned char** packed, + unsigned char** memory, + size_t* SPACE ) +{ + const ddt_elem_desc_t* _elem = &((ELEM)->elem); + size_t do_now_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size; + size_t do_now = (*COUNT); + unsigned char* _memory = (*memory) + _elem->disp; + unsigned char* _packed = *packed; + + assert( *(COUNT) <= (_elem->count * _elem->blocklen)); + + /** + * First check if we already did something on this element ? The COUNT is the number + * of remaining predefined types in the current elem, not how many predefined types + * should be manipulated in the current call (this number is instead reflected on the + * SPACE). + */ + if( 0 == (do_now = (*COUNT) % _elem->blocklen) ) + return 1; + + size_t left_in_block = do_now; /* left in the current blocklen */ + + if( (do_now_bytes * do_now) > *(SPACE) ) + do_now = (*SPACE) / do_now_bytes; + + do_now_bytes *= do_now; + + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "unpack memcpy( %p, %p, %lu ) => space %lu [prolog]\n", + (void*)_memory, (void*)_packed, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); + MEMCPY_CSUM( _memory, _packed, do_now_bytes, (CONVERTOR) ); + *(memory) += (ptrdiff_t)do_now_bytes; + if( do_now == left_in_block ) /* compensate if completed a blocklen */ + *(memory) += _elem->extent - (_elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size); + + *(COUNT) -= do_now; + *(SPACE) -= do_now_bytes; + *(packed) += do_now_bytes; + return (do_now == left_in_block); +} + static inline void -unpack_predefined_data( opal_convertor_t* CONVERTOR, /* the convertor */ - const dt_elem_desc_t* ELEM, /* the element description */ - size_t* COUNT, /* the number of elements */ - unsigned char** SOURCE, /* the source pointer */ - unsigned char** DESTINATION, /* the destination pointer */ - size_t* SPACE ) /* the space in the destination buffer */ +unpack_predefined_data( opal_convertor_t* CONVERTOR, + const dt_elem_desc_t* ELEM, + size_t* COUNT, + unsigned char** packed, + unsigned char** memory, + size_t* SPACE ) { - size_t _copy_count = *(COUNT); - size_t _copy_blength; const ddt_elem_desc_t* _elem = &((ELEM)->elem); - unsigned char* _destination = (*DESTINATION) + _elem->disp; + size_t blocklen_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size; + size_t cando_count = (*COUNT), do_now_bytes; + unsigned char* _memory = (*memory) + _elem->disp; + unsigned char* _packed = *packed; + + assert( 0 == (cando_count % _elem->blocklen) ); /* no partials here */ + assert( *(COUNT) <= (_elem->count * _elem->blocklen)); + + if( (blocklen_bytes * cando_count) > *(SPACE) ) + cando_count = (*SPACE) / blocklen_bytes; - _copy_blength = opal_datatype_basicDatatypes[_elem->common.type]->size; - if( (_copy_count * _copy_blength) > *(SPACE) ) { - _copy_count = (*(SPACE) / _copy_blength); - if( 0 == _copy_count ) return; /* nothing to do */ + /* premptively update the number of COUNT we will return. */ + *(COUNT) -= cando_count; + + if( 1 == _elem->blocklen ) { /* Do as many full blocklen as possible */ + for(; cando_count > 0; cando_count--) { + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "unpack memcpy( %p, %p, %lu ) => space %lu [blen = 1]\n", + (void*)_memory, (void*)_packed, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); + MEMCPY_CSUM( _memory, _packed, blocklen_bytes, (CONVERTOR) ); + _packed += blocklen_bytes; + _memory += _elem->extent; + } + goto update_and_return; } - if( (ptrdiff_t)_copy_blength == _elem->extent ) { - _copy_blength *= _copy_count; - /* the extent and the size of the basic datatype are equal */ - OPAL_DATATYPE_SAFEGUARD_POINTER( _destination, _copy_blength, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "unpack 1. memcpy( %p, %p, %lu ) => space %lu\n", - (void*)_destination, (void*)*(SOURCE), (unsigned long)_copy_blength, (unsigned long)(*(SPACE)) ); ); - MEMCPY_CSUM( _destination, *(SOURCE), _copy_blength, (CONVERTOR) ); - *(SOURCE) += _copy_blength; - _destination += _copy_blength; - } else { - for(size_t _i = 0; _i < _copy_count; _i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( _destination, _copy_blength, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); + if( (1 < _elem->count) && (_elem->blocklen <= cando_count) ) { + blocklen_bytes *= _elem->blocklen; + + do { /* Do as many full blocklen as possible */ + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); DO_DEBUG( opal_output( 0, "unpack 2. memcpy( %p, %p, %lu ) => space %lu\n", - (void*)_destination, (void*)*(SOURCE), (unsigned long)_copy_blength, (unsigned long)(*(SPACE) - (_i * _copy_blength)) ); ); - MEMCPY_CSUM( _destination, *(SOURCE), _copy_blength, (CONVERTOR) ); - *(SOURCE) += _copy_blength; - _destination += _elem->extent; - } - _copy_blength *= _copy_count; + (void*)_memory, (void*)_packed, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); + MEMCPY_CSUM( _memory, _packed, blocklen_bytes, (CONVERTOR) ); + _packed += blocklen_bytes; + _memory += _elem->extent; + cando_count -= _elem->blocklen; + } while (_elem->blocklen <= cando_count); + } + + /** + * As an epilog do anything left from the last blocklen. + */ + if( 0 != cando_count ) { + assert( (cando_count < _elem->blocklen) || + ((1 == _elem->count) && (cando_count <= _elem->blocklen)) ); + do_now_bytes = cando_count * opal_datatype_basicDatatypes[_elem->common.type]->size; + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "unpack 3. memcpy( %p, %p, %lu ) => space %lu [epilog]\n", + (void*)_memory, (void*)_packed, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); + MEMCPY_CSUM( _memory, _packed, do_now_bytes, (CONVERTOR) ); + _memory += do_now_bytes; + _packed += do_now_bytes; } - (*DESTINATION) = _destination - _elem->disp; - *(SPACE) -= _copy_blength; - *(COUNT) -= _copy_count; + + update_and_return: + *(memory) = _memory - _elem->disp; + *(SPACE) -= (_packed - *packed); + *(packed) = _packed; } static inline void unpack_contiguous_loop( opal_convertor_t* CONVERTOR, const dt_elem_desc_t* ELEM, size_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, + unsigned char** packed, + unsigned char** memory, size_t* SPACE ) { const ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); const ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); - unsigned char* _destination = (*DESTINATION) + _end_loop->first_elem_disp; + unsigned char* _memory = (*memory) + _end_loop->first_elem_disp; size_t _copy_loops = *(COUNT); if( (_copy_loops * _end_loop->size) > *(SPACE) ) _copy_loops = (*(SPACE) / _end_loop->size); for(size_t _i = 0; _i < _copy_loops; _i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( _destination, _end_loop->size, (CONVERTOR)->pBaseBuf, + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, _end_loop->size, (CONVERTOR)->pBaseBuf, (CONVERTOR)->pDesc, (CONVERTOR)->count ); DO_DEBUG( opal_output( 0, "unpack 3. memcpy( %p, %p, %lu ) => space %lu\n", - (void*)_destination, (void*)*(SOURCE), (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i * _end_loop->size) ); ); - MEMCPY_CSUM( _destination, *(SOURCE), _end_loop->size, (CONVERTOR) ); - *(SOURCE) += _end_loop->size; - _destination += _loop->extent; + (void*)_memory, (void*)*(packed), (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i * _end_loop->size) ); ); + MEMCPY_CSUM( _memory, *(packed), _end_loop->size, (CONVERTOR) ); + *(packed) += _end_loop->size; + _memory += _loop->extent; } - *(DESTINATION) = _destination - _end_loop->first_elem_disp; - *(SPACE) -= _copy_loops * _end_loop->size; - *(COUNT) -= _copy_loops; + *(memory) = _memory - _end_loop->first_elem_disp; + *(SPACE) -= _copy_loops * _end_loop->size; + *(COUNT) -= _copy_loops; } -#define UNPACK_PREDEFINED_DATATYPE( CONVERTOR, ELEM, COUNT, SOURCE, DESTINATION, SPACE ) \ - unpack_predefined_data( (CONVERTOR), (ELEM), &(COUNT), &(SOURCE), &(DESTINATION), &(SPACE) ) +#define UNPACK_PARTIAL_BLOCKLEN( CONVERTOR, /* the convertor */ \ + ELEM, /* the basic element to be packed */ \ + COUNT, /* the number of elements */ \ + PACKED, /* the destination pointer (char*) */ \ + MEMORY, /* the source pointer (char*) */ \ + SPACE ) /* the space in the destination buffer */ \ +unpack_partial_blocklen( (CONVERTOR), (ELEM), &(COUNT), &(PACKED), &(MEMORY), &(SPACE) ) + +#define UNPACK_PREDEFINED_DATATYPE( CONVERTOR, /* the convertor */ \ + ELEM, /* the basic element to be packed */ \ + COUNT, /* the number of elements */ \ + PACKED, /* the destination pointer (char*) */ \ + MEMORY, /* the source pointer (char*) */ \ + SPACE ) /* the space in the destination buffer */ \ +unpack_predefined_data( (CONVERTOR), (ELEM), &(COUNT), &(PACKED), &(MEMORY), &(SPACE) ) -#define UNPACK_CONTIGUOUS_LOOP( CONVERTOR, ELEM, COUNT, SOURCE, DESTINATION, SPACE ) \ - unpack_contiguous_loop( (CONVERTOR), (ELEM), &(COUNT), &(SOURCE), &(DESTINATION), &(SPACE) ) +#define UNPACK_CONTIGUOUS_LOOP( CONVERTOR, ELEM, COUNT, PACKED, MEMORY, SPACE ) \ + unpack_contiguous_loop( (CONVERTOR), (ELEM), &(COUNT), &(PACKED), &(MEMORY), &(SPACE) ) #endif /* OPAL_DATATYPE_UNPACK_H_HAS_BEEN_INCLUDED */ diff --git a/test/datatype/ddt_raw.c b/test/datatype/ddt_raw.c index de35d6b83f4..bba285ceea0 100644 --- a/test/datatype/ddt_raw.c +++ b/test/datatype/ddt_raw.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -71,7 +71,7 @@ static int test_upper( unsigned int length ) iov_count = 5; max_data = 0; opal_convertor_raw( pConv, iov, &iov_count, &max_data ); - i -= max_data; + i -= max_data; } GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); @@ -85,12 +85,12 @@ static int test_upper( unsigned int length ) } /** - * Conversion function. They deal with data-types in 3 ways, always making local copies. + * Conversion function. They deal with datatypes in 3 ways, always making local copies. * In order to allow performance testings, there are 3 functions: * - one copying directly from one memory location to another one using the - * data-type copy function. - * - one which use a 2 convertors created with the same data-type - * - and one using 2 convertors created from different data-types. + * datatype copy function. + * - one which use a 2 convertors created with the same datatype + * - and one using 2 convertors created from different datatypes. * */ static int local_copy_ddt_raw( ompi_datatype_t* pdt, int count, int iov_num ) @@ -114,13 +114,13 @@ static int local_copy_ddt_raw( ompi_datatype_t* pdt, int count, int iov_num ) GET_TIME( start ); while( 0 == opal_convertor_raw(convertor, iov, &iov_count, &max_data) ) { #if 0 - printf( "New raw extraction (iov_count = %d, max_data = %zu)\n", - iov_count, max_data ); - for( i = 0; i < iov_count; i++ ) { - printf( "\t{%p, %d}\n", iov[i].iov_base, iov[i].iov_len ); - } + printf( "New raw extraction (iov_count = %d, max_data = %zu)\n", + iov_count, max_data ); + for( i = 0; i < iov_count; i++ ) { + printf( "\t{%p, %d}\n", iov[i].iov_base, iov[i].iov_len ); + } #endif - remaining_length -= max_data; + remaining_length -= max_data; iov_count = iov_num; } remaining_length -= max_data; @@ -129,19 +129,23 @@ static int local_copy_ddt_raw( ompi_datatype_t* pdt, int count, int iov_num ) printf( "raw extraction in %ld microsec\n", total_time ); OBJ_RELEASE( convertor ); if( remaining_length != 0 ) { - printf( "Not all raw description was been extracted (%lu bytes missing)\n", - (unsigned long) remaining_length ); + printf( "Not all raw description was been extracted (%lu bytes missing)\n", + (unsigned long) remaining_length ); } free(iov); return OMPI_SUCCESS; } /** - * Main function. Call several tests and print-out the results. It try to stress the convertor - * using difficult data-type constructions as well as strange segment sizes for the conversion. - * Usually, it is able to detect most of the data-type and convertor problems. Any modifications - * on the data-type engine should first pass all the tests from this file, before going into other - * tests. + * Go over a set of datatypes and copy them using the raw functionality provided by the + * convertor. The goal of this test is to stress the convertor using several more or less + * difficult datatype, with a large set of segment sizes for the conversion. It can be used + * to highlight the raw capability of the convertor as well as detecting datatype convertor + * problems. + * + * This test is part of the testing infrastructure for the core datatype engine. As such any + * modifications on the datatype engine should first pass all the tests from this file, + * before going into other tests. */ int main( int argc, char* argv[] ) { @@ -226,7 +230,7 @@ int main( int argc, char* argv[] ) OBJ_RELEASE( pdt3 ); assert( pdt3 == NULL ); printf( ">>--------------------------------------------<<\n" ); - printf( " Contiguous data-type (MPI_DOUBLE)\n" ); + printf( " Contiguous datatype (MPI_DOUBLE)\n" ); pdt = MPI_DOUBLE; if( outputFlags & CHECK_PACK_UNPACK ) { local_copy_ddt_raw(pdt, 4500, iov_num); @@ -235,37 +239,37 @@ int main( int argc, char* argv[] ) printf( ">>--------------------------------------------<<\n" ); if( outputFlags & CHECK_PACK_UNPACK ) { - printf( "Contiguous multiple data-type (4500*1)\n" ); + printf( "Contiguous multiple datatype (4500*1)\n" ); pdt = create_contiguous_type( MPI_DOUBLE, 4500 ); local_copy_ddt_raw(pdt, 1, iov_num); OBJ_RELEASE( pdt ); assert( pdt == NULL ); - printf( "Contiguous multiple data-type (450*10)\n" ); + printf( "Contiguous multiple datatype (450*10)\n" ); pdt = create_contiguous_type( MPI_DOUBLE, 450 ); local_copy_ddt_raw(pdt, 10, iov_num); OBJ_RELEASE( pdt ); assert( pdt == NULL ); - printf( "Contiguous multiple data-type (45*100)\n" ); + printf( "Contiguous multiple datatype (45*100)\n" ); pdt = create_contiguous_type( MPI_DOUBLE, 45 ); local_copy_ddt_raw(pdt, 100, iov_num); OBJ_RELEASE( pdt ); assert( pdt == NULL ); - printf( "Contiguous multiple data-type (100*45)\n" ); + printf( "Contiguous multiple datatype (100*45)\n" ); pdt = create_contiguous_type( MPI_DOUBLE, 100 ); local_copy_ddt_raw(pdt, 45, iov_num); OBJ_RELEASE( pdt ); assert( pdt == NULL ); - printf( "Contiguous multiple data-type (10*450)\n" ); + printf( "Contiguous multiple datatype (10*450)\n" ); pdt = create_contiguous_type( MPI_DOUBLE, 10 ); local_copy_ddt_raw(pdt, 450, iov_num); OBJ_RELEASE( pdt ); assert( pdt == NULL ); - printf( "Contiguous multiple data-type (1*4500)\n" ); + printf( "Contiguous multiple datatype (1*4500)\n" ); pdt = create_contiguous_type( MPI_DOUBLE, 1 ); local_copy_ddt_raw(pdt, 4500, iov_num); OBJ_RELEASE( pdt ); assert( pdt == NULL ); } printf( ">>--------------------------------------------<<\n" ); printf( ">>--------------------------------------------<<\n" ); - printf( "Vector data-type (450 times 10 double stride 11)\n" ); + printf( "Vector datatype (450 times 10 double stride 11)\n" ); pdt = create_vector_type( MPI_DOUBLE, 450, 10, 11 ); if( outputFlags & DUMP_DATA_AFTER_COMMIT ) { - ompi_datatype_dump( pdt ); + ompi_datatype_dump( pdt ); } if( outputFlags & CHECK_PACK_UNPACK ) { local_copy_ddt_raw(pdt, 1, iov_num); @@ -292,9 +296,9 @@ int main( int argc, char* argv[] ) printf( ">>--------------------------------------------<<\n" ); pdt = test_create_blacs_type(); if( outputFlags & CHECK_PACK_UNPACK ) { - if( outputFlags & DUMP_DATA_AFTER_COMMIT ) { - ompi_datatype_dump( pdt ); - } + if( outputFlags & DUMP_DATA_AFTER_COMMIT ) { + ompi_datatype_dump( pdt ); + } local_copy_ddt_raw(pdt, 4500, iov_num); } printf( ">>--------------------------------------------<<\n" ); diff --git a/test/datatype/ddt_raw2.c b/test/datatype/ddt_raw2.c index cc78e23006a..7e91a323f7a 100644 --- a/test/datatype/ddt_raw2.c +++ b/test/datatype/ddt_raw2.c @@ -33,9 +33,6 @@ mca_common_ompio_decode_datatype ( ompi_datatype_t *datatype, uint32_t *iovec_count, int increment) { - - - opal_convertor_t *convertor; size_t remaining_length = 0; uint32_t i; @@ -43,7 +40,6 @@ mca_common_ompio_decode_datatype ( ompi_datatype_t *datatype, struct iovec *temp_iov=NULL; size_t temp_data; - convertor = opal_convertor_create( opal_local_arch, 0 ); if (OMPI_SUCCESS != opal_convertor_prepare_for_send (convertor, @@ -55,9 +51,9 @@ mca_common_ompio_decode_datatype ( ompi_datatype_t *datatype, } if ( 0 == datatype->super.size ) { - *iovec_count = 0; - *iov = NULL; - return OMPI_SUCCESS; + *iovec_count = 0; + *iov = NULL; + return OMPI_SUCCESS; } remaining_length = count * datatype->super.size; @@ -69,10 +65,8 @@ mca_common_ompio_decode_datatype ( ompi_datatype_t *datatype, return OMPI_ERR_OUT_OF_RESOURCE; } - while (0 == opal_convertor_raw(convertor, - temp_iov, - &temp_count, - &temp_data)) { + while (0 == opal_convertor_raw(convertor, temp_iov, + &temp_count, &temp_data)) { *iovec_count = *iovec_count + temp_count; *iov = (struct iovec *) realloc (*iov, *iovec_count * sizeof(struct iovec)); if (NULL == *iov) { @@ -80,7 +74,7 @@ mca_common_ompio_decode_datatype ( ompi_datatype_t *datatype, free(temp_iov); return OMPI_ERR_OUT_OF_RESOURCE; } - for (i=0 ; i 0 ) { - *iov = (struct iovec *) realloc (*iov, *iovec_count * sizeof(struct iovec)); - if (NULL == *iov) { - opal_output(1, "OUT OF MEMORY\n"); + *iov = (struct iovec *) realloc (*iov, *iovec_count * sizeof(struct iovec)); + if (NULL == *iov) { + opal_output(1, "OUT OF MEMORY\n"); free(temp_iov); - return OMPI_ERR_OUT_OF_RESOURCE; - } + return OMPI_ERR_OUT_OF_RESOURCE; + } } for (i=0 ; idesc.used + 2 ); - if( (bLength == stride) || (1 >= count) ) { /* the elements are contiguous */ + if( (bLength == stride) || (1 == count) ) { /* the elements are contiguous */ opal_datatype_add( pData, oldType, count * bLength, 0, extent ); } else { if( 1 == bLength ) { @@ -476,7 +476,7 @@ static int32_t opal_datatype_create_hvector( int count, int bLength, ptrdiff_t s } pTempData = opal_datatype_create( oldType->desc.used + 2 ); - if( ((extent * bLength) == stride) || (1 >= count) ) { /* contiguous */ + if( ((extent * bLength) == stride) || (1 == count) ) { /* contiguous */ pData = pTempData; opal_datatype_add( pData, oldType, count * bLength, 0, extent ); } else { diff --git a/test/datatype/to_self.c b/test/datatype/to_self.c index 58849f5e90c..073fe4f0b57 100644 --- a/test/datatype/to_self.c +++ b/test/datatype/to_self.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ /* - * Copyright (c) 2004-2014 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * $COPYRIGHT$ @@ -15,8 +15,9 @@ #include #include #include +#include -#if OPEN_MPI && 0 +#if 0 && OPEN_MPI extern void ompi_datatype_dump( MPI_Datatype ddt ); #define MPI_DDT_DUMP(ddt) ompi_datatype_dump( (ddt) ) #else @@ -178,23 +179,145 @@ create_indexed_gap_optimized_ddt( void ) return dt3; } -static void print_result( int length, int cycles, double time ) -{ - double bandwidth, clock_prec; +/******************************************************************** + *******************************************************************/ + +#define DO_CONTIG 0x00000001 +#define DO_CONSTANT_GAP 0x00000002 +#define DO_INDEXED_GAP 0x00000004 +#define DO_OPTIMIZED_INDEXED_GAP 0x00000008 +#define DO_STRUCT_CONSTANT_GAP_RESIZED 0x00000010 + +#define DO_PACK 0x01000000 +#define DO_UNPACK 0x02000000 +#define DO_ISEND_RECV 0x04000000 +#define DO_ISEND_IRECV 0x08000000 +#define DO_IRECV_SEND 0x10000000 +#define DO_IRECV_ISEND 0x20000000 + +#define MIN_LENGTH 1024 +#define MAX_LENGTH (1024*1024) + +static int cycles = 100; +static int trials = 20; +static int warmups = 2; + +static void print_result( int length, int trials, double* timers ) +{ + double bandwidth, clock_prec, temp; + double min_time, max_time, average, std_dev = 0.0; + double ordered[trials]; + int t, pos, quartile_start, quartile_end; + + for( t = 0; t < trials; ordered[t] = timers[t], t++ ); + for( t = 0; t < trials-1; t++ ) { + temp = ordered[t]; + pos = t; + for( int i = t+1; i < trials; i++ ) { + if( temp > ordered[i] ) { + temp = ordered[i]; + pos = i; + } + } + if( pos != t ) { + temp = ordered[t]; + ordered[t] = ordered[pos]; + ordered[pos] = temp; + } + } + quartile_start = trials - (3 * trials) / 4; + quartile_end = trials - (1 * trials) / 4; clock_prec = MPI_Wtick(); - bandwidth = (length * clock_prec * cycles) / (1024.0 * 1024.0) / (time * clock_prec); - printf( "%8d\t%.6f\t%.4f MB/s\n", length, time / cycles, bandwidth ); + min_time = ordered[quartile_start]; + max_time = ordered[quartile_start]; + average = ordered[quartile_start]; + for( t = quartile_start + 1; t < quartile_end; t++ ) { + if( min_time > ordered[t] ) min_time = ordered[t]; + if( max_time < ordered[t] ) max_time = ordered[t]; + average += ordered[t]; + } + average /= (quartile_end - quartile_start); + for( t = quartile_start; t < quartile_end; t++ ) { + std_dev += (ordered[t] - average) * (ordered[t] - average); + } + std_dev = sqrt( std_dev/(quartile_end - quartile_start) ); + + bandwidth = (length * clock_prec) / (1024.0 * 1024.0) / (average * clock_prec); + printf( "%8d\t%15g\t%10.4f MB/s [min %10g max %10g std %2.2f%%]\n", length, average, bandwidth, + min_time, max_time, (100.0 * std_dev) / average ); +} + +static int pack( int cycles, + MPI_Datatype sdt, int scount, void* sbuf, + void* packed_buf ) +{ + int position, myself, c, t, outsize; + double timers[trials]; + + MPI_Type_size( sdt, &outsize ); + outsize *= scount; + + MPI_Comm_rank( MPI_COMM_WORLD, &myself ); + + for( t = 0; t < warmups; t++ ) { + for( c = 0; c < cycles; c++ ) { + position = 0; + MPI_Pack(sbuf, scount, sdt, packed_buf, outsize, &position, MPI_COMM_WORLD); + } + } + + for( t = 0; t < trials; t++ ) { + timers[t] = MPI_Wtime(); + for( c = 0; c < cycles; c++ ) { + position = 0; + MPI_Pack(sbuf, scount, sdt, packed_buf, outsize, &position, MPI_COMM_WORLD); + } + timers[t] = (MPI_Wtime() - timers[t]) / cycles; + } + print_result( outsize, trials, timers ); + return 0; +} + +static int unpack( int cycles, + void* packed_buf, + MPI_Datatype rdt, int rcount, void* rbuf ) +{ + int position, myself, c, t, insize; + double timers[trials]; + + MPI_Type_size( rdt, &insize ); + insize *= rcount; + + MPI_Comm_rank( MPI_COMM_WORLD, &myself ); + + for( t = 0; t < warmups; t++ ) { + for( c = 0; c < cycles; c++ ) { + position = 0; + MPI_Unpack(packed_buf, insize, &position, rbuf, rcount, rdt, MPI_COMM_WORLD); + } + } + + for( t = 0; t < trials; t++ ) { + timers[t] = MPI_Wtime(); + for( c = 0; c < cycles; c++ ) { + position = 0; + MPI_Unpack(packed_buf, insize, &position, rbuf, rcount, rdt, MPI_COMM_WORLD); + } + timers[t] = (MPI_Wtime() - timers[t]) / cycles; + } + print_result( insize, trials, timers ); + return 0; } static int isend_recv( int cycles, MPI_Datatype sdt, int scount, void* sbuf, MPI_Datatype rdt, int rcount, void* rbuf ) { - int myself, tag = 0, i, slength, rlength; + int myself, tag = 0, c, t, slength, rlength; MPI_Status status; MPI_Request req; - double tstart, tend; + double timers[trials]; MPI_Type_size( sdt, &slength ); slength *= scount; @@ -203,21 +326,16 @@ static int isend_recv( int cycles, MPI_Comm_rank( MPI_COMM_WORLD, &myself ); - tstart = MPI_Wtime(); - for( i = 0; i < cycles; i++ ) { -#ifndef FAST - MPI_Isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &req ); - MPI_Recv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &status ); - MPI_Wait( &req, &status ); - /*MPI_Request_free( &req );*/ -#else - ftmpi_mpi_isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &req ); - ftmpi_mpi_recv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &status ); - ftmpi_request_free( &req ); -#endif + for( t = 0; t < trials; t++ ) { + timers[t] = MPI_Wtime(); + for( c = 0; c < cycles; c++ ) { + MPI_Isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &req ); + MPI_Recv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &status ); + MPI_Wait( &req, &status ); + } + timers[t] = (MPI_Wtime() - timers[t]) / cycles; } - tend = MPI_Wtime(); - print_result( rlength, cycles, tend - tstart ); + print_result( rlength, trials, timers ); return 0; } @@ -225,10 +343,10 @@ static int irecv_send( int cycles, MPI_Datatype sdt, int scount, void* sbuf, MPI_Datatype rdt, int rcount, void* rbuf ) { - int myself, tag = 0, i, slength, rlength; + int myself, tag = 0, c, t, slength, rlength; MPI_Request req; MPI_Status status; - double tstart, tend; + double timers[trials]; MPI_Type_size( sdt, &slength ); slength *= scount; @@ -237,21 +355,16 @@ static int irecv_send( int cycles, MPI_Comm_rank( MPI_COMM_WORLD, &myself ); - tstart = MPI_Wtime(); - for( i = 0; i < cycles; i++ ) { -#ifndef FAST - MPI_Irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &req ); - MPI_Send( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD ); - MPI_Wait( &req, &status ); - /*MPI_Request_free( &req );*/ -#else - ftmpi_mpi_irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &req ); - ftmpi_mpi_send( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD ); - ftmpi_request_free( &req ); -#endif + for( t = 0; t < trials; t++ ) { + timers[t] = MPI_Wtime(); + for( c = 0; c < cycles; c++ ) { + MPI_Irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &req ); + MPI_Send( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD ); + MPI_Wait( &req, &status ); + } + timers[t] = (MPI_Wtime() - timers[t]) / cycles; } - tend = MPI_Wtime(); - print_result( rlength, cycles, tend - tstart ); + print_result( rlength, trials, timers ); return 0; } @@ -259,10 +372,10 @@ static int isend_irecv_wait( int cycles, MPI_Datatype sdt, int scount, void* sbuf, MPI_Datatype rdt, int rcount, void* rbuf ) { - int myself, tag = 0, i, slength, rlength; - MPI_Request sreq, rreq; - MPI_Status status; - double tstart, tend; + int myself, tag = 0, c, t, slength, rlength; + MPI_Request requests[2]; + MPI_Status statuses[2]; + double timers[trials]; MPI_Type_size( sdt, &slength ); slength *= scount; @@ -271,25 +384,16 @@ static int isend_irecv_wait( int cycles, MPI_Comm_rank( MPI_COMM_WORLD, &myself ); - tstart = MPI_Wtime(); - for( i = 0; i < cycles; i++ ) { -#ifndef FAST - MPI_Isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &sreq ); - MPI_Irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &rreq ); - MPI_Wait( &sreq, &status ); - MPI_Wait( &rreq, &status ); - /*MPI_Request_free( &sreq );*/ - /*MPI_Request_free( &rreq );*/ -#else - ftmpi_mpi_isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &sreq ); - ftmpi_mpi_irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &rreq ); - ftmpi_wait( &sreq, &status ); - ftmpi_request_free( &sreq ); - ftmpi_request_free( &rreq ); -#endif + for( t = 0; t < trials; t++ ) { + timers[t] = MPI_Wtime(); + for( c = 0; c < cycles; c++ ) { + MPI_Isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &requests[0] ); + MPI_Irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &requests[1] ); + MPI_Waitall( 2, requests, statuses ); + } + timers[t] = (MPI_Wtime() - timers[t]) / cycles; } - tend = MPI_Wtime(); - print_result( rlength, cycles, tend - tstart ); + print_result( rlength, trials, timers ); return 0; } @@ -297,10 +401,10 @@ static int irecv_isend_wait( int cycles, MPI_Datatype sdt, int scount, void* sbuf, MPI_Datatype rdt, int rcount, void* rbuf ) { - int myself, tag = 0, i, slength, rlength; - MPI_Request sreq, rreq; - MPI_Status status; - double tstart, tend; + int myself, tag = 0, c, t, slength, rlength; + MPI_Request requests[2]; + MPI_Status statuses[2]; + double timers[trials]; MPI_Type_size( sdt, &slength ); slength *= scount; @@ -309,74 +413,82 @@ static int irecv_isend_wait( int cycles, MPI_Comm_rank( MPI_COMM_WORLD, &myself ); - tstart = MPI_Wtime(); - for( i = 0; i < cycles; i++ ) { -#ifndef FAST - MPI_Irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &rreq ); - MPI_Isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &sreq ); - MPI_Wait( &sreq, &status ); - MPI_Wait( &rreq, &status ); - /*MPI_Request_free( &sreq );*/ - /*MPI_Request_free( &rreq );*/ -#else - ftmpi_mpi_irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &rreq ); - ftmpi_mpi_isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &sreq ); - ftmpi_wait( &sreq, &status ); - ftmpi_request_free( &sreq ); - ftmpi_request_free( &rreq ); -#endif + for( t = 0; t < trials; t++ ) { + timers[t] = MPI_Wtime(); + for( c = 0; c < cycles; c++ ) { + MPI_Irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &requests[0] ); + MPI_Isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &requests[1] ); + MPI_Waitall( 2, requests, statuses ); + } + timers[t] = (MPI_Wtime() - timers[t]) / cycles; } - tend = MPI_Wtime(); - print_result( rlength, cycles, tend - tstart ); + print_result( rlength, trials, timers); return 0; } -static int do_test_for_ddt( MPI_Datatype sddt, MPI_Datatype rddt, int length ) +static int do_test_for_ddt( int doop, MPI_Datatype sddt, MPI_Datatype rddt, int length ) { - int i; MPI_Aint lb, extent; char *sbuf, *rbuf; + int i; MPI_Type_get_extent( sddt, &lb, &extent ); sbuf = (char*)malloc( length ); rbuf = (char*)malloc( length ); - printf( "# Isend recv (length %d)\n", length ); - for( i = 1; i <= (length/extent); i *= 2 ) { - isend_recv( 10, sddt, i, sbuf, rddt, i, rbuf ); + if( doop & DO_PACK ) { + printf("# Pack (max length %d)\n", length); + for( i = 1; i <= (length/extent); i *= 2 ) { + pack( cycles, sddt, i, sbuf, rbuf ); + } } - printf( "# Isend Irecv Wait (length %d)\n", length ); - for( i = 1; i <= (length/extent); i *= 2 ) { - isend_irecv_wait( 10, sddt, i, sbuf, rddt, i, rbuf ); + + if( doop & DO_UNPACK ) { + printf("# Unpack (length %d)\n", length); + for( i = 1; i <= (length/extent); i *= 2 ) { + unpack( cycles, sbuf, rddt, i, rbuf ); + } } - printf( "# Irecv send (length %d)\n", length ); - for( i = 1; i <= (length/extent); i *= 2 ) { - irecv_send( 10, sddt, i, sbuf, rddt, i, rbuf ); + + if( doop & DO_ISEND_RECV ) { + printf( "# Isend recv (length %d)\n", length ); + for( i = 1; i <= (length/extent); i *= 2 ) { + isend_recv( cycles, sddt, i, sbuf, rddt, i, rbuf ); + } } - printf( "# Irecv Isend Wait (length %d)\n", length ); - for( i = 1; i <= (length/extent); i *= 2 ) { - irecv_isend_wait( 10, sddt, i, sbuf, rddt, i, rbuf ); + + if( doop & DO_ISEND_IRECV ) { + printf( "# Isend Irecv Wait (length %d)\n", length ); + for( i = 1; i <= (length/extent); i *= 2 ) { + isend_irecv_wait( cycles, sddt, i, sbuf, rddt, i, rbuf ); + } + } + + if( doop & DO_IRECV_SEND ) { + printf( "# Irecv send (length %d)\n", length ); + for( i = 1; i <= (length/extent); i *= 2 ) { + irecv_send( cycles, sddt, i, sbuf, rddt, i, rbuf ); + } + } + + if( doop & DO_IRECV_SEND ) { + printf( "# Irecv Isend Wait (length %d)\n", length ); + for( i = 1; i <= (length/extent); i *= 2 ) { + irecv_isend_wait( cycles, sddt, i, sbuf, rddt, i, rbuf ); + } } free( sbuf ); free( rbuf ); return 0; } -#define DO_CONTIG 0x01 -#define DO_CONSTANT_GAP 0x02 -#define DO_INDEXED_GAP 0x04 -#define DO_OPTIMIZED_INDEXED_GAP 0x08 -#define DO_STRUCT_CONSTANT_GAP_RESIZED 0x10 - -#define MIN_LENGTH 1024 -#define MAX_LENGTH (1024*1024) - int main( int argc, char* argv[] ) { - int run_tests = 0xffffffff; /* do all tests by default */ - int length, rank, size; + int run_tests = 0xffff; /* do all datatype tests by default */ + int rank, size; MPI_Datatype ddt; - /*int run_tests = DO_CONSTANT_GAP;*/ + run_tests |= DO_PACK | DO_UNPACK; + MPI_Init (&argc, &argv); MPI_Comm_rank (MPI_COMM_WORLD, &rank); @@ -389,16 +501,14 @@ int main( int argc, char* argv[] ) if( run_tests & DO_CONTIG ) { printf( "\ncontiguous datatype\n\n" ); - for( length = MIN_LENGTH; length < MAX_LENGTH; length <<=1 ) - do_test_for_ddt( MPI_INT, MPI_INT, length ); + do_test_for_ddt( run_tests, MPI_INT, MPI_INT, MAX_LENGTH ); } if( run_tests & DO_INDEXED_GAP ) { printf( "\nindexed gap\n\n" ); ddt = create_indexed_gap_ddt(); MPI_DDT_DUMP( ddt ); - for( length = MIN_LENGTH; length < MAX_LENGTH; length <<=1 ) - do_test_for_ddt( ddt, ddt, length ); + do_test_for_ddt( run_tests, ddt, ddt, MAX_LENGTH ); MPI_Type_free( &ddt ); } @@ -406,8 +516,7 @@ int main( int argc, char* argv[] ) printf( "\noptimized indexed gap\n\n" ); ddt = create_indexed_gap_optimized_ddt(); MPI_DDT_DUMP( ddt ); - for( length = MIN_LENGTH; length < MAX_LENGTH; length <<=1 ) - do_test_for_ddt( ddt, ddt, length ); + do_test_for_ddt( run_tests, ddt, ddt, MAX_LENGTH ); MPI_Type_free( &ddt ); } @@ -415,8 +524,7 @@ int main( int argc, char* argv[] ) printf( "\nconstant indexed gap\n\n" ); ddt = create_indexed_constant_gap_ddt( 80, 100, 1 ); MPI_DDT_DUMP( ddt ); - for( length = MIN_LENGTH; length < MAX_LENGTH; length <<=1 ) - do_test_for_ddt( ddt, ddt, length ); + do_test_for_ddt( run_tests, ddt, ddt, MAX_LENGTH ); MPI_Type_free( &ddt ); } @@ -424,8 +532,7 @@ int main( int argc, char* argv[] ) printf( "\noptimized constant indexed gap\n\n" ); ddt = create_optimized_indexed_constant_gap_ddt( 80, 100, 1 ); MPI_DDT_DUMP( ddt ); - for( length = MIN_LENGTH; length < MAX_LENGTH; length <<=1 ) - do_test_for_ddt( ddt, ddt, length ); + do_test_for_ddt( run_tests, ddt, ddt, MAX_LENGTH ); MPI_Type_free( &ddt ); } @@ -433,8 +540,7 @@ int main( int argc, char* argv[] ) printf( "\nstruct constant gap resized\n\n" ); ddt = create_struct_constant_gap_resized_ddt( 0 /* unused */, 0 /* unused */, 0 /* unused */ ); MPI_DDT_DUMP( ddt ); - for( length = MIN_LENGTH; length < MAX_LENGTH; length <<=1 ) - do_test_for_ddt( ddt, ddt, length ); + do_test_for_ddt( run_tests, ddt, ddt, MAX_LENGTH ); MPI_Type_free( &ddt ); } diff --git a/test/datatype/unpack_ooo.c b/test/datatype/unpack_ooo.c index 458ef550930..58ef8a95774 100644 --- a/test/datatype/unpack_ooo.c +++ b/test/datatype/unpack_ooo.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ /* - * Copyright (c) 2014 The University of Tennessee and The University + * Copyright (c) 2014-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2014 Research Organization for Information Science @@ -18,7 +18,6 @@ #include "opal/runtime/opal.h" #include "opal/datatype/opal_convertor.h" #include "opal/datatype/opal_datatype_internal.h" -// #include #include #include #include @@ -61,6 +60,18 @@ static void print_bar_pbar(struct foo_t* bar, struct pfoo_t* pbar) fprintf(stderr, "\n"); } +static void print_stack(opal_convertor_t* conv) +{ + printf("Stack pos %d [converted %" PRIsize_t "/%" PRIsize_t "]\n", + conv->stack_pos, conv->bConverted, conv->local_size); + for( uint32_t i = 0; i <= conv->stack_pos; i++ ) { + printf( "[%u] index %d, type %s count %" PRIsize_t " disp %p\n", + i, conv->pStack[i].index, opal_datatype_basicDatatypes[conv->pStack[i].type]->name, + conv->pStack[i].count, (void*)conv->pStack[i].disp); + } + printf("\n"); +} + static int testcase(ompi_datatype_t * newtype, size_t arr[10][2]) { int i, j, errors = 0; struct iovec a; @@ -104,6 +115,7 @@ static int testcase(ompi_datatype_t * newtype, size_t arr[10][2]) { max_data = a.iov_len; pos = arr[i][1]; opal_convertor_set_position(pConv, &pos); + print_stack(pConv); assert(arr[i][1] == pos); opal_convertor_unpack( pConv, &a, &iov_count, &max_data ); a.iov_base = (char*)a.iov_base - 1024; @@ -118,9 +130,10 @@ static int testcase(ompi_datatype_t * newtype, size_t arr[10][2]) { bar[j].d[1] != 0.0 || bar[j].d[2] != pbar[j].d[1]) { if(0 == errors) { - fprintf(stderr, "ERROR ! count=%d, position=%d, ptr = %p" + (void)opal_datatype_dump(&newtype->super); + fprintf(stderr, "ERROR ! position=%d/%d, ptr = %p" " got (%d,%d,%d,%g,%g,%g) expected (%d,%d,%d,%g,%g,%g)\n", - N, j, (void*)&bar[j], + j, N, (void*)&bar[j], bar[j].i[0], bar[j].i[1], bar[j].i[2],