diff --git a/opal/datatype/opal_datatype_checksum.h b/opal/datatype/opal_datatype_checksum.h index 64fecd179b9..02e374e9099 100644 --- a/opal/datatype/opal_datatype_checksum.h +++ b/opal/datatype/opal_datatype_checksum.h @@ -44,11 +44,17 @@ do { \ (CONVERTOR)->checksum += OPAL_CSUM_PARTIAL( (SRC), (BLENGTH), &(CONVERTOR)->csum_ui1, &(CONVERTOR)->csum_ui2 ); \ } while (0) +#define BASIC_DTT_MEMCPY_CSUM(DST, SRC, BLENGTH, CONVERTOR ) \ + MEMCPY_CSUM( (DST), (SRC), (BLENGTH), (CONVERTOR) ) + #else /* if CHECKSUM */ #define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \ MEMCPY( (DST), (SRC), (BLENGTH) ) +#define BASIC_DTT_MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \ + BASIC_DTT_MEMCPY( (DST), (SRC), (BLENGTH) ) + #define COMPUTE_CSUM( SRC, BLENGTH, CONVERTOR ) #endif /* if CHECKSUM */ diff --git a/opal/datatype/opal_datatype_memcpy.h b/opal/datatype/opal_datatype_memcpy.h index 972009ac96a..c9c17bbfbc7 100644 --- a/opal/datatype/opal_datatype_memcpy.h +++ b/opal/datatype/opal_datatype_memcpy.h @@ -17,4 +17,23 @@ #define MEMCPY( DST, SRC, BLENGTH ) \ memcpy( (DST), (SRC), (BLENGTH) ) +/* + * This macro is called whenever we are packing/unpacking a DDT that + * is built with basic datatypes. + * Specifying a fixed size for the memcpy() makes the Intel compiler + * inline it as an assignment operation. + * This code is a bit hacky, but doing this we can divide the latency + * by up to 2 during DDT exchanges. 
+ */ +#define BASIC_DTT_MEMCPY( DST, SRC, BLENGTH ) \ + do { \ + if (4 == (BLENGTH)) { /* We are copying an int */ \ + memcpy((DST), (SRC), 4); \ + } else if (8 == (BLENGTH)) { /* We are copying a double */ \ + memcpy((DST), (SRC), 8); \ + } else { \ + memcpy((DST), (SRC), (BLENGTH)); \ + } \ + } while (0) + #endif /* OPAL_DATATYPE_MEMCPY_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/opal_datatype_pack.h b/opal/datatype/opal_datatype_pack.h index f952cabc3c0..3d44a7345b1 100644 --- a/opal/datatype/opal_datatype_pack.h +++ b/opal/datatype/opal_datatype_pack.h @@ -26,6 +26,11 @@ #undef MEMCPY_CSUM #define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \ CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) ) + +#undef BASIC_DTT_MEMCPY_CSUM +#define BASIC_DTT_MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \ + MEMCPY_CSUM( (DST), (SRC), (BLENGTH), (CONVERTOR) ) + #endif static inline void pack_predefined_data( opal_convertor_t* CONVERTOR, @@ -53,7 +58,7 @@ static inline void pack_predefined_data( opal_convertor_t* CONVERTOR, (CONVERTOR)->pDesc, (CONVERTOR)->count ); DO_DEBUG( opal_output( 0, "pack 1. 
memcpy( %p, %p, %lu ) => space %lu\n", (void*)*(DESTINATION), (void*)_source, (unsigned long)_copy_blength, (unsigned long)(*(SPACE)) ); ); - MEMCPY_CSUM( *(DESTINATION), _source, _copy_blength, (CONVERTOR) ); + BASIC_DTT_MEMCPY_CSUM( *(DESTINATION), _source, _copy_blength, (CONVERTOR) ); _source += _copy_blength; *(DESTINATION) += _copy_blength; } else { diff --git a/opal/datatype/opal_datatype_unpack.h b/opal/datatype/opal_datatype_unpack.h index d837aad5ab7..8c2882bcd58 100644 --- a/opal/datatype/opal_datatype_unpack.h +++ b/opal/datatype/opal_datatype_unpack.h @@ -24,6 +24,11 @@ #undef MEMCPY_CSUM #define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \ CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) ) + +#undef BASIC_DTT_MEMCPY_CSUM +#define BASIC_DTT_MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \ + MEMCPY_CSUM( (DST), (SRC), (BLENGTH), (CONVERTOR) ) + #endif static inline void @@ -52,7 +57,7 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR, /* the convertor */ (CONVERTOR)->pDesc, (CONVERTOR)->count ); DO_DEBUG( opal_output( 0, "unpack 1. memcpy( %p, %p, %lu ) => space %lu\n", (void*)_destination, (void*)*(SOURCE), (unsigned long)_copy_blength, (unsigned long)(*(SPACE)) ); ); - MEMCPY_CSUM( _destination, *(SOURCE), _copy_blength, (CONVERTOR) ); + BASIC_DTT_MEMCPY_CSUM( _destination, *(SOURCE), _copy_blength, (CONVERTOR) ); *(SOURCE) += _copy_blength; _destination += _copy_blength; } else {