diff --git a/ompi/datatype/ompi_datatype.h b/ompi/datatype/ompi_datatype.h index e2ee2a79c01..26978d0867e 100644 --- a/ompi/datatype/ompi_datatype.h +++ b/ompi/datatype/ompi_datatype.h @@ -59,7 +59,7 @@ BEGIN_C_DECLS #define OMPI_DATATYPE_FLAG_DATA_FORTRAN 0xC000 #define OMPI_DATATYPE_FLAG_DATA_LANGUAGE 0xC000 -#define OMPI_DATATYPE_MAX_PREDEFINED 50 +#define OMPI_DATATYPE_MAX_PREDEFINED 52 #if OMPI_DATATYPE_MAX_PREDEFINED > OPAL_DATATYPE_MAX_SUPPORTED #error Need to increase the number of supported dataypes by OPAL (value OPAL_DATATYPE_MAX_SUPPORTED). diff --git a/ompi/datatype/ompi_datatype_external.c b/ompi/datatype/ompi_datatype_external.c index 53b907218cf..75ca59354bc 100644 --- a/ompi/datatype/ompi_datatype_external.c +++ b/ompi/datatype/ompi_datatype_external.c @@ -126,7 +126,7 @@ int ompi_datatype_pack_external_size(const char datarep[], int incount, CONVERTOR_SEND_CONVERSION, &local_convertor ); - opal_convertor_get_unpacked_size( &local_convertor, &length ); + opal_convertor_get_packed_size( &local_convertor, &length ); *size = (MPI_Aint)length; OBJ_DESTRUCT( &local_convertor ); diff --git a/ompi/datatype/ompi_datatype_external32.c b/ompi/datatype/ompi_datatype_external32.c index 108e14258b7..9f1e6242412 100644 --- a/ompi/datatype/ompi_datatype_external32.c +++ b/ompi/datatype/ompi_datatype_external32.c @@ -26,39 +26,74 @@ /* From the MPI standard. 
external32 use the following types: * Type Length - * MPI_PACKED 1 - * MPI_BYTE 1 - * MPI_CHAR 1 - * MPI_UNSIGNED_CHAR 1 - * MPI_SIGNED_CHAR 1 - * MPI_WCHAR 2 - * MPI_SHORT 2 - * MPI_UNSIGNED_SHORT 2 - * MPI_INT 4 - * MPI_UNSIGNED 4 - * MPI_LONG 4 - * MPI_UNSIGNED_LONG 4 - * MPI_FLOAT 4 - * MPI_DOUBLE 8 - * MPI_LONG_DOUBLE 16 + * MPI_PACKED 1 + * MPI_BYTE 1 + * MPI_CHAR 1 + * MPI_UNSIGNED_CHAR 1 + * MPI_SIGNED_CHAR 1 + * MPI_WCHAR 2 + * MPI_SHORT 2 + * MPI_UNSIGNED_SHORT 2 + * MPI_INT 4 + * MPI_LONG 4 + * MPI_UNSIGNED 4 + * MPI_UNSIGNED_LONG 4 + * MPI_LONG_LONG_INT 8 + * MPI_UNSIGNED_LONG_LONG 8 + * MPI_FLOAT 4 + * MPI_DOUBLE 8 + * MPI_LONG_DOUBLE 16 + * + * MPI_C_BOOL 1 + * MPI_INT8_T 1 + * MPI_INT16_T 2 + * MPI_INT32_T 4 + * MPI_INT64_T 8 + * MPI_UINT8_T 1 + * MPI_UINT16_T 2 + * MPI_UINT32_T 4 + * MPI_UINT64_T 8 + * MPI_AINT 8 + * MPI_COUNT 8 + * MPI_OFFSET 8 + * MPI_C_COMPLEX 2*4 + * MPI_C_FLOAT_COMPLEX 2*4 + * MPI_C_DOUBLE_COMPLEX 2*8 + * MPI_C_LONG_DOUBLE_COMPLEX 2*16 + * * Fortran types - * MPI_CHARACTER 1 - * MPI_LOGICAL 4 - * MPI_INTEGER 4 - * MPI_REAL 4 - * MPI_DOUBLE_PRECISION 8 - * MPI_COMPLEX 2*4 - * MPI_DOUBLE_COMPLEX 2*8 + * MPI_CHARACTER 1 + * MPI_LOGICAL 4 + * MPI_INTEGER 4 + * MPI_REAL 4 + * MPI_DOUBLE_PRECISION 8 + * MPI_COMPLEX 2*4 + * MPI_DOUBLE_COMPLEX 2*8 + * + * MPI_CXX_BOOL 1 + * MPI_CXX_FLOAT_COMPLEX 2*4 + * MPI_CXX_DOUBLE_COMPLEX 2*8 + * MPI_CXX_LONG_DOUBLE_COMPLEX 2*16 + * * Optional types - * MPI_INTEGER1 1 - * MPI_INTEGER2 2 - * MPI_INTEGER4 4 - * MPI_INTEGER8 8 - * MPI_LONG_LONG_INT 8 - * MPI_UNSIGNED_LONG_LONG 8 - * MPI_REAL4 4 - * MPI_REAL8 8 - * MPI_REAL16 16 + * MPI_INTEGER1 1 + * MPI_INTEGER2 2 + * MPI_INTEGER4 4 + * MPI_INTEGER8 8 + * MPI_INTEGER16 16 + * MPI_REAL2 2 + * MPI_REAL4 4 + * MPI_REAL8 8 + * MPI_REAL16 16 + * MPI_COMPLEX4 2*2 + * MPI_COMPLEX8 2*4 + * MPI_COMPLEX16 2*8 + * MPI_COMPLEX32 2*16 + * + * MPI_CXX_BOOL 1 + * MPI_CXX_FLOAT_COMPLEX 2*4 + * MPI_CXX_DOUBLE_COMPLEX 2*8 + * MPI_CXX_LONG_DOUBLE_COMPLEX 2*16 * * All 
floating point values are in big-endian IEEE format. Double extended use 16 bytes, with * 15 exponent bits (bias = 10383), 112 mantissa bits and the same encoding as double. All diff --git a/ompi/datatype/ompi_datatype_internal.h b/ompi/datatype/ompi_datatype_internal.h index 1b137c1f947..e46f5137de1 100644 --- a/ompi/datatype/ompi_datatype_internal.h +++ b/ompi/datatype/ompi_datatype_internal.h @@ -109,8 +109,14 @@ #define OMPI_DATATYPE_MPI_SHORT_FLOAT 0x30 #define OMPI_DATATYPE_MPI_C_SHORT_FLOAT_COMPLEX 0x31 +/* + * Datatypes that have a different external32 length. + */ +#define OMPI_DATATYPE_MPI_LONG 0x32 +#define OMPI_DATATYPE_MPI_UNSIGNED_LONG 0x33 + /* This should __ALWAYS__ stay last */ -#define OMPI_DATATYPE_MPI_UNAVAILABLE 0x32 +#define OMPI_DATATYPE_MPI_UNAVAILABLE 0x34 #define OMPI_DATATYPE_MPI_MAX_PREDEFINED (OMPI_DATATYPE_MPI_UNAVAILABLE+1) @@ -177,20 +183,6 @@ #define OMPI_DATATYPE_MPI_UNSIGNED OMPI_DATATYPE_MPI_UINT64_T #endif -#if SIZEOF_LONG == 1 -#define OMPI_DATATYPE_MPI_LONG OMPI_DATATYPE_MPI_INT8_T -#define OMPI_DATATYPE_MPI_UNSIGNED_LONG OMPI_DATATYPE_MPI_UINT8_T -#elif SIZEOF_LONG == 2 -#define OMPI_DATATYPE_MPI_LONG OMPI_DATATYPE_MPI_INT16_T -#define OMPI_DATATYPE_MPI_UNSIGNED_LONG OMPI_DATATYPE_MPI_UINT16_T -#elif SIZEOF_LONG == 4 -#define OMPI_DATATYPE_MPI_LONG OMPI_DATATYPE_MPI_INT32_T -#define OMPI_DATATYPE_MPI_UNSIGNED_LONG OMPI_DATATYPE_MPI_UINT32_T -#elif SIZEOF_LONG == 8 -#define OMPI_DATATYPE_MPI_LONG OMPI_DATATYPE_MPI_INT64_T -#define OMPI_DATATYPE_MPI_UNSIGNED_LONG OMPI_DATATYPE_MPI_UINT64_T -#endif - #if SIZEOF_LONG_LONG == 1 #define OMPI_DATATYPE_MPI_LONG_LONG_INT OMPI_DATATYPE_MPI_INT8_T #define OMPI_DATATYPE_MPI_UNSIGNED_LONG_LONG OMPI_DATATYPE_MPI_UINT8_T @@ -571,16 +563,8 @@ extern const ompi_datatype_t* ompi_datatype_basicDatatypes[OMPI_DATATYPE_MPI_MAX #define OMPI_DATATYPE_INITIALIZER_UNSIGNED OPAL_DATATYPE_INITIALIZER_UINT8 #endif -#if SIZEOF_LONG == 4 -#define OMPI_DATATYPE_INITIALIZER_LONG 
OPAL_DATATYPE_INITIALIZER_INT4 -#define OMPI_DATATYPE_INITIALIZER_UNSIGNED_LONG OPAL_DATATYPE_INITIALIZER_UINT4 -#elif SIZEOF_LONG == 8 -#define OMPI_DATATYPE_INITIALIZER_LONG OPAL_DATATYPE_INITIALIZER_INT8 -#define OMPI_DATATYPE_INITIALIZER_UNSIGNED_LONG OPAL_DATATYPE_INITIALIZER_UINT8 -#elif SIZEOF_LONG == 16 -#define OMPI_DATATYPE_INITIALIZER_LONG OPAL_DATATYPE_INITIALIZER_INT16 -#define OMPI_DATATYPE_INITIALIZER_UNSIGNED_LONG OPAL_DATATYPE_INITIALIZER_UINT16 -#endif +#define OMPI_DATATYPE_INITIALIZER_LONG OPAL_DATATYPE_INITIALIZER_LONG +#define OMPI_DATATYPE_INITIALIZER_UNSIGNED_LONG OPAL_DATATYPE_INITIALIZER_UNSIGNED_LONG #if SIZEOF_LONG_LONG == 4 #define OMPI_DATATYPE_INITIALIZER_LONG_LONG_INT OPAL_DATATYPE_INITIALIZER_INT4 diff --git a/ompi/datatype/ompi_datatype_module.c b/ompi/datatype/ompi_datatype_module.c index 232f0d90507..5a9a0aa9110 100644 --- a/ompi/datatype/ompi_datatype_module.c +++ b/ompi/datatype/ompi_datatype_module.c @@ -366,6 +366,8 @@ const ompi_datatype_t* ompi_datatype_basicDatatypes[OMPI_DATATYPE_MPI_MAX_PREDEF [OMPI_DATATYPE_MPI_LB] = &ompi_mpi_lb.dt, [OMPI_DATATYPE_MPI_UB] = &ompi_mpi_ub.dt, + [OMPI_DATATYPE_MPI_LONG] = &ompi_mpi_long.dt, + [OMPI_DATATYPE_MPI_UNSIGNED_LONG] = &ompi_mpi_unsigned_long.dt, /* MPI 3.0 types */ [OMPI_DATATYPE_MPI_COUNT] = &ompi_mpi_count.dt, diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.h b/ompi/mca/pml/ob1/pml_ob1_recvreq.h index b0b89388dea..c9ffd2a9ba6 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.h @@ -234,8 +234,8 @@ static inline void prepare_recv_req_converter(mca_pml_ob1_recv_request_t *req) req->req_recv.req_base.req_addr, 0, &req->req_recv.req_base.req_convertor); - opal_convertor_get_unpacked_size(&req->req_recv.req_base.req_convertor, - &req->req_bytes_expected); + opal_convertor_get_packed_size(&req->req_recv.req_base.req_convertor, + &req->req_bytes_expected); } } diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c index 
34efaaa8f16..e08265b42bc 100644 --- a/opal/datatype/opal_convertor.c +++ b/opal/datatype/opal_convertor.c @@ -142,7 +142,13 @@ opal_convertor_master_t *opal_convertor_find_or_create_master(uint32_t remote_ar } else { opal_output(0, "Unknown sizeof(bool) for the remote architecture\n"); } - + if (opal_arch_checkmask(&master->remote_arch, OPAL_ARCH_LONGIS64)) { + remote_sizes[OPAL_DATATYPE_LONG] = 8; + remote_sizes[OPAL_DATATYPE_UNSIGNED_LONG] = 8; + } else { + remote_sizes[OPAL_DATATYPE_LONG] = 4; + remote_sizes[OPAL_DATATYPE_UNSIGNED_LONG] = 4; + } /** * Now we can compute the conversion mask. For all sizes where the remote * and local architecture differ a conversion is needed. Moreover, if the @@ -434,7 +440,7 @@ int32_t opal_convertor_set_position_nocheck(opal_convertor_t *convertor, size_t } rc = opal_convertor_generic_simple_position(convertor, position); /** - * If we have a non-contigous send convertor don't allow it move in the middle + * If we have a non-contiguous send convertor don't allow it move in the middle * of a predefined datatype, it won't be able to copy out the left-overs * anyway. Instead force the position to stay on predefined datatypes * boundaries. As we allow partial predefined datatypes on the contiguous @@ -484,8 +490,8 @@ size_t opal_convertor_compute_remote_size(opal_convertor_t *pConvertor) pConvertor->remote_size = pConvertor->local_size; if (OPAL_UNLIKELY(datatype->bdt_used & pConvertor->master->hetero_mask)) { pConvertor->flags &= (~CONVERTOR_HOMOGENEOUS); - if (!(pConvertor->flags & CONVERTOR_SEND - && pConvertor->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS)) { + /* Can we use the optimized description? 
*/ + if (pConvertor->flags & OPAL_DATATYPE_OPTIMIZED_RESTRICTED) { pConvertor->use_desc = &(datatype->desc); } if (0 == (pConvertor->flags & CONVERTOR_HAS_REMOTE_SIZE)) { diff --git a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h index 05cc2b2ec67..396f959090d 100644 --- a/opal/datatype/opal_convertor.h +++ b/opal/datatype/opal_convertor.h @@ -199,22 +199,19 @@ static inline int32_t opal_convertor_need_buffers(const opal_convertor_t *pConve size_t opal_convertor_compute_remote_size(opal_convertor_t *pConv); /** - * Return the local size of the convertor (count times the size of the datatype). + * Return the packed size of the memory layout represented by this + * convertor. This is the size of the buffer that would be needed + * for the conversion (takes into account the type of the operation, + * aka pack or unpack, as well as which side is supposed to do the + * type conversion). */ -static inline void opal_convertor_get_packed_size(const opal_convertor_t *pConv, size_t *pSize) +static inline void +opal_convertor_get_packed_size(const opal_convertor_t *pConv, size_t *pSize) { *pSize = pConv->local_size; -} - -/** - * Return the remote size of the convertor (count times the remote size of the - * datatype). On homogeneous environments the local and remote sizes are - * identical. 
- */ -static inline void opal_convertor_get_unpacked_size(const opal_convertor_t *pConv, size_t *pSize) -{ - if (pConv->flags & CONVERTOR_HOMOGENEOUS) { - *pSize = pConv->local_size; + if ((pConv->flags & CONVERTOR_HOMOGENEOUS) || + ((pConv->flags & CONVERTOR_SEND) && !(pConv->flags & CONVERTOR_SEND_CONVERSION)) || + ((pConv->flags & CONVERTOR_RECV) && (pConv->flags & CONVERTOR_SEND_CONVERSION))) { return; } if (0 == (CONVERTOR_HAS_REMOTE_SIZE & pConv->flags)) { diff --git a/opal/datatype/opal_copy_functions.c b/opal/datatype/opal_copy_functions.c index 0cb8b50c274..fec19a2a70e 100644 --- a/opal/datatype/opal_copy_functions.c +++ b/opal/datatype/opal_copy_functions.c @@ -62,10 +62,10 @@ \ if ((from_extent == (ptrdiff_t) local_TYPE_size) \ && (to_extent == (ptrdiff_t) remote_TYPE_size)) { \ - /* copy of contigous data at both source and destination */ \ + /* copy of contiguous data at both source and destination */ \ MEMCPY(to, from, count *local_TYPE_size); \ } else { \ - /* source or destination are non-contigous */ \ + /* source or destination are non-contiguous */ \ for (size_t i = 0; i < count; i++) { \ MEMCPY(to, from, local_TYPE_size); \ to += to_extent; \ @@ -254,30 +254,37 @@ COPY_TYPE(wchar, wchar_t, 1) /* Table of predefined copy functions - one for each OPAL type */ /* NOTE: The order of this array *MUST* match the order in opal_datatype_basicDatatypes */ conversion_fct_t opal_datatype_copy_functions[OPAL_DATATYPE_MAX_PREDEFINED] = { - (conversion_fct_t) NULL, /* OPAL_DATATYPE_LOOP */ - (conversion_fct_t) NULL, /* OPAL_DATATYPE_END_LOOP */ - (conversion_fct_t) NULL, /* OPAL_DATATYPE_LB */ - (conversion_fct_t) NULL, /* OPAL_DATATYPE_UB */ - (conversion_fct_t) copy_bytes_1, /* OPAL_DATATYPE_INT1 */ - (conversion_fct_t) copy_bytes_2, /* OPAL_DATATYPE_INT2 */ - (conversion_fct_t) copy_bytes_4, /* OPAL_DATATYPE_INT4 */ - (conversion_fct_t) copy_bytes_8, /* OPAL_DATATYPE_INT8 */ - (conversion_fct_t) copy_bytes_16, /* OPAL_DATATYPE_INT16 */ - (conversion_fct_t) 
copy_bytes_1, /* OPAL_DATATYPE_UINT1 */ - (conversion_fct_t) copy_bytes_2, /* OPAL_DATATYPE_UINT2 */ - (conversion_fct_t) copy_bytes_4, /* OPAL_DATATYPE_UINT4 */ - (conversion_fct_t) copy_bytes_8, /* OPAL_DATATYPE_UINT8 */ - (conversion_fct_t) copy_bytes_16, /* OPAL_DATATYPE_UINT16 */ - (conversion_fct_t) copy_float_2, /* OPAL_DATATYPE_FLOAT2 */ - (conversion_fct_t) copy_float_4, /* OPAL_DATATYPE_FLOAT4 */ - (conversion_fct_t) copy_float_8, /* OPAL_DATATYPE_FLOAT8 */ - (conversion_fct_t) copy_float_12, /* OPAL_DATATYPE_FLOAT12 */ - (conversion_fct_t) copy_float_16, /* OPAL_DATATYPE_FLOAT16 */ - (conversion_fct_t) copy_short_float_complex, /* OPAL_DATATYPE_SHORT_FLOAT_COMPLEX */ - (conversion_fct_t) copy_float_complex, /* OPAL_DATATYPE_FLOAT_COMPLEX */ - (conversion_fct_t) copy_double_complex, /* OPAL_DATATYPE_DOUBLE_COMPLEX */ - (conversion_fct_t) copy_long_double_complex, /* OPAL_DATATYPE_LONG_DOUBLE_COMPLEX */ - (conversion_fct_t) copy_bool, /* OPAL_DATATYPE_BOOL */ - (conversion_fct_t) copy_wchar, /* OPAL_DATATYPE_WCHAR */ - (conversion_fct_t) NULL /* OPAL_DATATYPE_UNAVAILABLE */ + [OPAL_DATATYPE_LOOP] = (conversion_fct_t) NULL, + [OPAL_DATATYPE_END_LOOP] = (conversion_fct_t) NULL, + [OPAL_DATATYPE_LB] = (conversion_fct_t) NULL, + [OPAL_DATATYPE_UB] = (conversion_fct_t) NULL, + [OPAL_DATATYPE_INT1] = (conversion_fct_t) copy_bytes_1, + [OPAL_DATATYPE_INT2] = (conversion_fct_t) copy_bytes_2, + [OPAL_DATATYPE_INT4] = (conversion_fct_t) copy_bytes_4, + [OPAL_DATATYPE_INT8] = (conversion_fct_t) copy_bytes_8, + [OPAL_DATATYPE_INT16] = (conversion_fct_t) copy_bytes_16, + [OPAL_DATATYPE_UINT1] = (conversion_fct_t) copy_bytes_1, + [OPAL_DATATYPE_UINT2] = (conversion_fct_t) copy_bytes_2, + [OPAL_DATATYPE_UINT4] = (conversion_fct_t) copy_bytes_4, + [OPAL_DATATYPE_UINT8] = (conversion_fct_t) copy_bytes_8, + [OPAL_DATATYPE_UINT16] = (conversion_fct_t) copy_bytes_16, + [OPAL_DATATYPE_FLOAT2] = (conversion_fct_t) copy_float_2, + [OPAL_DATATYPE_FLOAT4] = (conversion_fct_t) 
copy_float_4, + [OPAL_DATATYPE_FLOAT8] = (conversion_fct_t) copy_float_8, + [OPAL_DATATYPE_FLOAT12] = (conversion_fct_t) copy_float_12, + [OPAL_DATATYPE_FLOAT16] = (conversion_fct_t) copy_float_16, + [OPAL_DATATYPE_SHORT_FLOAT_COMPLEX] = (conversion_fct_t) copy_short_float_complex, + [OPAL_DATATYPE_FLOAT_COMPLEX] = (conversion_fct_t) copy_float_complex, + [OPAL_DATATYPE_DOUBLE_COMPLEX] = (conversion_fct_t) copy_double_complex, + [OPAL_DATATYPE_LONG_DOUBLE_COMPLEX] = (conversion_fct_t) copy_long_double_complex, + [OPAL_DATATYPE_BOOL] = (conversion_fct_t) copy_bool, + [OPAL_DATATYPE_WCHAR] = (conversion_fct_t) copy_wchar, +#if SIZEOF_LONG == 4 + [OPAL_DATATYPE_LONG] = (conversion_fct_t)copy_bytes_4, + [OPAL_DATATYPE_UNSIGNED_LONG] = (conversion_fct_t)copy_bytes_4, +#elif SIZEOF_LONG == 8 + [OPAL_DATATYPE_LONG] = (conversion_fct_t)copy_bytes_8, + [OPAL_DATATYPE_UNSIGNED_LONG] = (conversion_fct_t)copy_bytes_8, +#endif + [OPAL_DATATYPE_UNAVAILABLE] = NULL, }; diff --git a/opal/datatype/opal_copy_functions_heterogeneous.c b/opal/datatype/opal_copy_functions_heterogeneous.c index be2adf33bce..5529a7f5d53 100644 --- a/opal/datatype/opal_copy_functions_heterogeneous.c +++ b/opal/datatype/opal_copy_functions_heterogeneous.c @@ -176,7 +176,7 @@ static inline void opal_dt_swap_long_double(void *to_p, const void *from_p, cons && (ptrdiff_t) sizeof(TYPE) == from_extent) { \ MEMCPY(to, from, count * sizeof(TYPE)); \ } else { \ - /* source or destination are non-contigous */ \ + /* source or destination are non-contiguous */ \ for (i = 0; i < count; i++) { \ MEMCPY(to, from, sizeof(TYPE)); \ to += to_extent; \ @@ -225,7 +225,7 @@ static inline void opal_dt_swap_long_double(void *to_p, const void *from_p, cons && (ptrdiff_t) sizeof(TYPE) == from_extent) { \ MEMCPY(to, from, count * sizeof(TYPE)); \ } else { \ - /* source or destination are non-contigous */ \ + /* source or destination are non-contiguous */ \ for (i = 0; i < count; i++) { \ MEMCPY(to, from, sizeof(TYPE)); \ to += 
to_extent; \ @@ -265,10 +265,10 @@ static inline void opal_dt_swap_long_double(void *to_p, const void *from_p, cons } \ } else if ((ptrdiff_t)(sizeof(TYPE1) + sizeof(TYPE2)) == to_extent \ && (ptrdiff_t)(sizeof(TYPE1) + sizeof(TYPE2)) == from_extent) { \ - /* source and destination are contigous */ \ + /* source and destination are contiguous */ \ MEMCPY(to, from, count *(sizeof(TYPE1) + sizeof(TYPE2))); \ } else { \ - /* source or destination are non-contigous */ \ + /* source or destination are non-contiguous */ \ for (i = 0; i < count; i++) { \ MEMCPY(to, from, sizeof(TYPE1) + sizeof(TYPE2)); \ to += to_extent; \ @@ -457,32 +457,255 @@ COPY_2SAMETYPE_HETEROGENEOUS_INTERNAL(long_double_complex, long double, 1) COPY_TYPE_HETEROGENEOUS(wchar, wchar_t) +#if SIZEOF_LONG == 8 +static int32_t +copy_long_heterogeneous(opal_convertor_t *pConvertor, size_t count, + const char* from, size_t from_len, ptrdiff_t from_extent, + char* to, size_t to_length, ptrdiff_t to_extent, + ptrdiff_t *advance) +{ + size_t i; + + datatype_check("long", sizeof(long), pConvertor->master->remote_sizes[OPAL_DATATYPE_LONG], &count, from, from_len, from_extent, to, + to_length, to_extent); + if (!((pConvertor->remoteArch ^ opal_local_arch) & OPAL_ARCH_LONGIS64)) { /* same sizeof(long) */ + if ((pConvertor->remoteArch ^ opal_local_arch) & OPAL_ARCH_ISBIGENDIAN) { /* different endianess */ + for (i = 0; i < count; i++) { + opal_dt_swap_bytes(to, from, sizeof(long), 1); + to += to_extent; + from += from_extent; + } + } else { + for (i = 0; i < count; i++) { + *(long*)to = *(long*)from; + to += to_extent; + from += from_extent; + } + } + } else { + /* the two sides have different lengths for sizeof(long) */ + if( CONVERTOR_SEND & pConvertor->flags ) { /* we're doing a pack */ + assert(CONVERTOR_SEND_CONVERSION & pConvertor->flags); + if ((pConvertor->remoteArch ^ opal_local_arch) & OPAL_ARCH_ISBIGENDIAN) { + /* different sizeof, we need to convert */ + if (opal_local_arch & OPAL_ARCH_LONGIS64) { + 
for (i = 0; i < count; i++) { /* from 8 to 4 bytes */ + int64_t val = *(int64_t*)from; + int32_t i32 = (int32_t)val; + opal_dt_swap_bytes(to, &i32, sizeof(int32_t), 1); + to += to_extent; + from += from_extent; + } + } else { + for (i = 0; i < count; i++) { /* from 4 to 8 bytes */ + int32_t val = *(int32_t*)from; + int64_t i64 = (int64_t)val; + opal_dt_swap_bytes(to, &i64, sizeof(int64_t), 1); + to += to_extent; + from += from_extent; + } + } + } else { /* both have the same endianness */ + if (opal_local_arch & OPAL_ARCH_LONGIS64) { + for (i = 0; i < count; i++) { /* from 8 to 4 bytes */ + long val = *(long*)from; + *(int32_t*)to = (int32_t)val; + to += to_extent; + from += from_extent; + } + } else { + for (i = 0; i < count; i++) { /* from 4 to 8 bytes */ + long val = *(long*)from; + *(int64_t*)to = (int64_t)val; + to += to_extent; + from += from_extent; + } + } + } + } else { /* unpack */ + if ((pConvertor->remoteArch ^ opal_local_arch) & OPAL_ARCH_ISBIGENDIAN) { + /* different endianness */ + if (opal_local_arch & OPAL_ARCH_LONGIS64) { + for (i = 0; i < count; i++) { /* from 4 to 8 bytes */ + int32_t val; + opal_dt_swap_bytes(&val, from, sizeof(int32_t), 1); + *(long*)to = (long)val; + to += to_extent; + from += from_extent; + } + } else { + for (i = 0; i < count; i++) { /* from 8 to 4 bytes */ + int64_t val; + opal_dt_swap_bytes(&val, from, sizeof(int64_t), 1); + *(long*)to = (long)val; + to += to_extent; + from += from_extent; + } + } + } else { /* both have the same endianness */ + if (opal_local_arch & OPAL_ARCH_LONGIS64) { + for (i = 0; i < count; i++) { /* from 4 to 8 bytes */ + int32_t val = *(int32_t*)from; + *(long*)to = (long)val; + to += to_extent; + from += from_extent; + } + } else { + for (i = 0; i < count; i++) { /* from 8 to 4 bytes */ + int64_t val = *(int64_t*)from; + *(long*)to = (long)val; + to += to_extent; + from += from_extent; + } + } + } + } + } + *advance = count * from_extent; + return count; +} + +static int32_t 
+copy_unsigned_long_heterogeneous(opal_convertor_t *pConvertor, size_t count, + const char* from, size_t from_len, ptrdiff_t from_extent, + char* to, size_t to_length, ptrdiff_t to_extent, + ptrdiff_t *advance) +{ + size_t i; + + datatype_check("unsigned long", sizeof(unsigned long), pConvertor->master->remote_sizes[OPAL_DATATYPE_UNSIGNED_LONG], + &count, from, from_len, from_extent, to, to_length, to_extent); + if (!((pConvertor->remoteArch ^ opal_local_arch) & OPAL_ARCH_LONGIS64)) { /* same sizeof(long) */ + if ((pConvertor->remoteArch ^ opal_local_arch) & OPAL_ARCH_ISBIGENDIAN) { /* different endianess */ + for (i = 0; i < count; i++) { + opal_dt_swap_bytes(to, from, sizeof(unsigned long), 1); + to += to_extent; + from += from_extent; + } + } else { + for (i = 0; i < count; i++) { + *(unsigned long*)to = *(unsigned long*)from; + to += to_extent; + from += from_extent; + } + } + } else { + /* the two sides have different lengths for sizeof(long) */ + if( CONVERTOR_SEND & pConvertor->flags ) { /* we're doing a pack */ + assert(CONVERTOR_SEND_CONVERSION & pConvertor->flags); + if ((pConvertor->remoteArch ^ opal_local_arch) & OPAL_ARCH_ISBIGENDIAN) { + /* different sizeof, we need to convert */ + if (opal_local_arch & OPAL_ARCH_LONGIS64) { + for (i = 0; i < count; i++) { /* from 8 to 4 bytes */ + uint64_t val = *(uint64_t*)from; + uint32_t i32 = (uint32_t)val; + opal_dt_swap_bytes(to, &i32, sizeof(uint32_t), 1); + to += to_extent; + from += from_extent; + } + } else { + for (i = 0; i < count; i++) { /* from 4 to 8 bytes */ + uint32_t val = *(uint32_t*)from; + uint64_t i64 = (uint64_t)val; + opal_dt_swap_bytes(to, &i64, sizeof(uint64_t), 1); + to += to_extent; + from += from_extent; + } + } + } else { /* both have the same endianess */ + if (opal_local_arch & OPAL_ARCH_LONGIS64) { + for (i = 0; i < count; i++) { /* from 8 to 4 bytes */ + unsigned long val = *(unsigned long*)from; + *(uint32_t*)to = (uint32_t)val; + to += to_extent; + from += from_extent; + } + } else 
{ + for (i = 0; i < count; i++) { /* from 4 to 8 bytes */ + unsigned long val = *(unsigned long*)from; + *(uint64_t*)to = (uint64_t)val; + to += to_extent; + from += from_extent; + } + } + } + } else { /* unpack */ + if ((pConvertor->remoteArch ^ opal_local_arch) & OPAL_ARCH_ISBIGENDIAN) { + /* different endianness */ + if (opal_local_arch & OPAL_ARCH_LONGIS64) { + for (i = 0; i < count; i++) { /* from 4 to 8 bytes */ + uint32_t val; + opal_dt_swap_bytes(&val, from, sizeof(uint32_t), 1); + *(unsigned long*)to = (unsigned long)val; + to += to_extent; + from += from_extent; + } + } else { + for (i = 0; i < count; i++) { /* from 8 to 4 bytes */ + uint64_t val; + opal_dt_swap_bytes(&val, from, sizeof(uint64_t), 1); + *(unsigned long*)to = (unsigned long)val; + to += to_extent; + from += from_extent; + } + } + } else { /* both have the same endianness */ + if (opal_local_arch & OPAL_ARCH_LONGIS64) { + for (i = 0; i < count; i++) { /* from 4 to 8 bytes */ + uint32_t val = *(uint32_t*)from; + *(unsigned long*)to = (unsigned long)val; + to += to_extent; + from += from_extent; + } + } else { + for (i = 0; i < count; i++) { /* from 8 to 4 bytes */ + uint64_t val = *(uint64_t*)from; + *(unsigned long*)to = (unsigned long)val; + to += to_extent; + from += from_extent; + } + } + } + } + } + *advance = count * from_extent; + return count; +} +#endif /* SIZEOF_LONG == 8 */ + /* table of predefined copy functions - one for each MPI type */ conversion_fct_t opal_datatype_heterogeneous_copy_functions[OPAL_DATATYPE_MAX_PREDEFINED] = { - [OPAL_DATATYPE_LOOP] = NULL, - [OPAL_DATATYPE_END_LOOP] = NULL, - [OPAL_DATATYPE_LB] = NULL, - [OPAL_DATATYPE_UB] = NULL, - [OPAL_DATATYPE_INT1] = (conversion_fct_t) copy_int1_heterogeneous, - [OPAL_DATATYPE_INT2] = (conversion_fct_t) copy_int2_heterogeneous, - [OPAL_DATATYPE_INT4] = (conversion_fct_t) copy_int4_heterogeneous, - [OPAL_DATATYPE_INT8] = (conversion_fct_t) copy_int8_heterogeneous, - [OPAL_DATATYPE_INT16] = (conversion_fct_t) 
copy_int16_heterogeneous, - [OPAL_DATATYPE_UINT1] = (conversion_fct_t) copy_int1_heterogeneous, - [OPAL_DATATYPE_UINT2] = (conversion_fct_t) copy_int2_heterogeneous, - [OPAL_DATATYPE_UINT4] = (conversion_fct_t) copy_int4_heterogeneous, - [OPAL_DATATYPE_UINT8] = (conversion_fct_t) copy_int8_heterogeneous, - [OPAL_DATATYPE_UINT16] = (conversion_fct_t) copy_int16_heterogeneous, - [OPAL_DATATYPE_FLOAT2] = (conversion_fct_t) copy_float2_heterogeneous, - [OPAL_DATATYPE_FLOAT4] = (conversion_fct_t) copy_float4_heterogeneous, - [OPAL_DATATYPE_FLOAT8] = (conversion_fct_t) copy_float8_heterogeneous, - [OPAL_DATATYPE_FLOAT12] = (conversion_fct_t) copy_float12_heterogeneous, - [OPAL_DATATYPE_FLOAT16] = (conversion_fct_t) copy_float16_heterogeneous, + [OPAL_DATATYPE_LOOP] = NULL, + [OPAL_DATATYPE_END_LOOP] = NULL, + [OPAL_DATATYPE_LB] = NULL, + [OPAL_DATATYPE_UB] = NULL, + [OPAL_DATATYPE_INT1] = (conversion_fct_t) copy_int1_heterogeneous, + [OPAL_DATATYPE_INT2] = (conversion_fct_t) copy_int2_heterogeneous, + [OPAL_DATATYPE_INT4] = (conversion_fct_t) copy_int4_heterogeneous, + [OPAL_DATATYPE_INT8] = (conversion_fct_t) copy_int8_heterogeneous, + [OPAL_DATATYPE_INT16] = (conversion_fct_t) copy_int16_heterogeneous, + [OPAL_DATATYPE_UINT1] = (conversion_fct_t) copy_int1_heterogeneous, + [OPAL_DATATYPE_UINT2] = (conversion_fct_t) copy_int2_heterogeneous, + [OPAL_DATATYPE_UINT4] = (conversion_fct_t) copy_int4_heterogeneous, + [OPAL_DATATYPE_UINT8] = (conversion_fct_t) copy_int8_heterogeneous, + [OPAL_DATATYPE_UINT16] = (conversion_fct_t) copy_int16_heterogeneous, + [OPAL_DATATYPE_FLOAT2] = (conversion_fct_t) copy_float2_heterogeneous, + [OPAL_DATATYPE_FLOAT4] = (conversion_fct_t) copy_float4_heterogeneous, + [OPAL_DATATYPE_FLOAT8] = (conversion_fct_t) copy_float8_heterogeneous, + [OPAL_DATATYPE_FLOAT12] = (conversion_fct_t) copy_float12_heterogeneous, + [OPAL_DATATYPE_FLOAT16] = (conversion_fct_t) copy_float16_heterogeneous, [OPAL_DATATYPE_SHORT_FLOAT_COMPLEX] = (conversion_fct_t) 
copy_short_float_complex_heterogeneous, - [OPAL_DATATYPE_FLOAT_COMPLEX] = (conversion_fct_t) copy_float_complex_heterogeneous, - [OPAL_DATATYPE_DOUBLE_COMPLEX] = (conversion_fct_t) copy_double_complex_heterogeneous, + [OPAL_DATATYPE_FLOAT_COMPLEX] = (conversion_fct_t) copy_float_complex_heterogeneous, + [OPAL_DATATYPE_DOUBLE_COMPLEX] = (conversion_fct_t) copy_double_complex_heterogeneous, [OPAL_DATATYPE_LONG_DOUBLE_COMPLEX] = (conversion_fct_t) copy_long_double_complex_heterogeneous, - [OPAL_DATATYPE_BOOL] = (conversion_fct_t) copy_cxx_bool_heterogeneous, - [OPAL_DATATYPE_WCHAR] = (conversion_fct_t) copy_wchar_heterogeneous, - [OPAL_DATATYPE_UNAVAILABLE] = NULL, + [OPAL_DATATYPE_BOOL] = (conversion_fct_t) copy_cxx_bool_heterogeneous, + [OPAL_DATATYPE_WCHAR] = (conversion_fct_t) copy_wchar_heterogeneous, +#if SIZEOF_LONG == 4 + [OPAL_DATATYPE_LONG] = (conversion_fct_t) copy_int4_heterogeneous, + [OPAL_DATATYPE_UNSIGNED_LONG] = (conversion_fct_t) copy_int4_heterogeneous, +#else + [OPAL_DATATYPE_LONG] = (conversion_fct_t) copy_long_heterogeneous, + [OPAL_DATATYPE_UNSIGNED_LONG] = (conversion_fct_t) copy_unsigned_long_heterogeneous, +#endif + [OPAL_DATATYPE_UNAVAILABLE] = NULL, }; diff --git a/opal/datatype/opal_datatype.h b/opal/datatype/opal_datatype.h index 41da0f40d35..7dabd1742c0 100644 --- a/opal/datatype/opal_datatype.h +++ b/opal/datatype/opal_datatype.h @@ -52,16 +52,15 @@ BEGIN_C_DECLS * This must match the same definition as in opal_datatype_internal.h */ #if !defined(OPAL_DATATYPE_MAX_PREDEFINED) -# define OPAL_DATATYPE_MAX_PREDEFINED 26 +# define OPAL_DATATYPE_MAX_PREDEFINED 28 #endif /* - * No more than this number of _Basic_ datatypes in C/CPP or Fortran - * are supported (in order to not change setup and usage of the predefined - * datatypes). + * Upper limit of the number of _Basic_ datatypes supported (in order to + * not change setup and usage of the predefined datatypes). * * BEWARE: This constant should reflect whatever the OMPI-layer needs. 
*/ -#define OPAL_DATATYPE_MAX_SUPPORTED 50 +#define OPAL_DATATYPE_MAX_SUPPORTED 64 /* flags for the datatypes. */ #define OPAL_DATATYPE_FLAG_UNAVAILABLE \ @@ -84,6 +83,15 @@ BEGIN_C_DECLS #define OPAL_DATATYPE_FLAG_BASIC \ (OPAL_DATATYPE_FLAG_PREDEFINED | OPAL_DATATYPE_FLAG_CONTIGUOUS | OPAL_DATATYPE_FLAG_NO_GAPS \ | OPAL_DATATYPE_FLAG_DATA | OPAL_DATATYPE_FLAG_COMMITTED) +/* + * If during the datatype optimization process we collapse contiguous elements with + * different types, we cannot use this optimized description for any communication + * in a heterogeneous setting, especially not for the external32 support. + * + * A datatype with this flag cannot use the optimized description in heterogeneous + * setups. + */ +#define OPAL_DATATYPE_OPTIMIZED_RESTRICTED 0x1000 /** * The number of supported entries in the data-type definition and the @@ -178,6 +186,8 @@ OPAL_DECLSPEC extern const opal_datatype_t opal_datatype_double_complex; OPAL_DECLSPEC extern const opal_datatype_t opal_datatype_long_double_complex; OPAL_DECLSPEC extern const opal_datatype_t opal_datatype_bool; OPAL_DECLSPEC extern const opal_datatype_t opal_datatype_wchar; +OPAL_DECLSPEC extern const opal_datatype_t opal_datatype_long; +OPAL_DECLSPEC extern const opal_datatype_t opal_datatype_unsigned_long; /* * Functions exported externally diff --git a/opal/datatype/opal_datatype_dump.c b/opal/datatype/opal_datatype_dump.c index ca06a79307a..259dcc3152f 100644 --- a/opal/datatype/opal_datatype_dump.c +++ b/opal/datatype/opal_datatype_dump.c @@ -99,6 +99,8 @@ int opal_datatype_dump_data_flags(unsigned short usflags, char *ptr, size_t leng } if ((usflags & OPAL_DATATYPE_FLAG_BASIC) == OPAL_DATATYPE_FLAG_BASIC) { ptr[9] = 'B'; + } else if (usflags & OPAL_DATATYPE_OPTIMIZED_RESTRICTED) { + ptr[9] = 'H'; /* optimized description restricted to homogeneous cases */ } /* We know nothing about the upper level language or flags! */ /* ... 
*/ diff --git a/opal/datatype/opal_datatype_internal.h b/opal/datatype/opal_datatype_internal.h index b3ec1de894b..c50f4501393 100644 --- a/opal/datatype/opal_datatype_internal.h +++ b/opal/datatype/opal_datatype_internal.h @@ -61,7 +61,9 @@ extern int opal_datatype_dfd; * * At the OPAL-level we do not care from which language the datatype came from * (C, C++ or FORTRAN), we only focus on their internal representation in - * the host memory. + * the host memory. There is one notable exception, the long predefined type + * which needs to be handled at the lowest level due to its variable size but + * fixed XDR representation. * * NOTE: This predefined datatype order should be matched by any upper-level * users of the OPAL datatype. @@ -92,7 +94,9 @@ extern int opal_datatype_dfd; #define OPAL_DATATYPE_LONG_DOUBLE_COMPLEX 22 #define OPAL_DATATYPE_BOOL 23 #define OPAL_DATATYPE_WCHAR 24 -#define OPAL_DATATYPE_UNAVAILABLE 25 +#define OPAL_DATATYPE_LONG 25 +#define OPAL_DATATYPE_UNSIGNED_LONG 26 +#define OPAL_DATATYPE_UNAVAILABLE 27 #ifndef OPAL_DATATYPE_MAX_PREDEFINED # define OPAL_DATATYPE_MAX_PREDEFINED (OPAL_DATATYPE_UNAVAILABLE + 1) @@ -381,6 +385,12 @@ struct opal_datatype_t; # define OPAL_DATATYPE_HANDLE_UINT16(AV, NOTAV, FLAGS) NOTAV(INT16, FLAGS) #endif + +#define OPAL_DATATYPE_INITIALIZER_LONG(FLAGS) \ + OPAL_DATATYPE_INIT_BASIC_DATATYPE(long, OPAL_ALIGNMENT_LONG, LONG, FLAGS) +#define OPAL_DATATYPE_INITIALIZER_UNSIGNED_LONG(FLAGS) \ + OPAL_DATATYPE_INIT_BASIC_DATATYPE(unsigned long, OPAL_ALIGNMENT_LONG, UNSIGNED_LONG, FLAGS) + #if defined(HAVE_SHORT_FLOAT) && SIZEOF_SHORT_FLOAT == 2 # define OPAL_DATATYPE_HANDLE_FLOAT2(AV, NOTAV, FLAGS) \ AV(short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT2, FLAGS) @@ -497,7 +507,8 @@ struct opal_datatype_t; #define OPAL_DATATYPE_HANDLE_LONG_DOUBLE_COMPLEX(AV, NOTAV, FLAGS) \ AV(long double _Complex, OPAL_ALIGNMENT_LONG_DOUBLE_COMPLEX, LONG_DOUBLE_COMPLEX, FLAGS) -#define OPAL_DATATYPE_HANDLE_BOOL(AV, NOTAV, FLAGS) AV(_Bool, 
OPAL_ALIGNMENT_BOOL, BOOL, FLAGS) +#define OPAL_DATATYPE_HANDLE_BOOL(AV, NOTAV, FLAGS) \ + AV(_Bool, OPAL_ALIGNMENT_BOOL, BOOL, FLAGS) #if OPAL_ALIGNMENT_WCHAR != 0 # define OPAL_DATATYPE_HANDLE_WCHAR(AV, NOTAV, FLAGS) \ diff --git a/opal/datatype/opal_datatype_module.c b/opal/datatype/opal_datatype_module.c index d8bc8f9d47c..26059b28472 100644 --- a/opal/datatype/opal_datatype_module.c +++ b/opal/datatype/opal_datatype_module.c @@ -88,6 +88,8 @@ OPAL_DECLSPEC const opal_datatype_t opal_datatype_long_double_complex = OPAL_DATATYPE_INITIALIZER_LONG_DOUBLE_COMPLEX(0); OPAL_DECLSPEC const opal_datatype_t opal_datatype_bool = OPAL_DATATYPE_INITIALIZER_BOOL(0); OPAL_DECLSPEC const opal_datatype_t opal_datatype_wchar = OPAL_DATATYPE_INITIALIZER_WCHAR(0); +OPAL_DECLSPEC const opal_datatype_t opal_datatype_long = OPAL_DATATYPE_INITIALIZER_LONG(0); +OPAL_DECLSPEC const opal_datatype_t opal_datatype_unsigned_long = OPAL_DATATYPE_INITIALIZER_UNSIGNED_LONG(0); OPAL_DECLSPEC const opal_datatype_t opal_datatype_unavailable = OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED(UNAVAILABLE, 0); @@ -126,6 +128,8 @@ OPAL_DECLSPEC const size_t opal_datatype_local_sizes[OPAL_DATATYPE_MAX_PREDEFINE [OPAL_DATATYPE_LONG_DOUBLE_COMPLEX] = sizeof(long double _Complex), [OPAL_DATATYPE_BOOL] = sizeof(_Bool), [OPAL_DATATYPE_WCHAR] = sizeof(wchar_t), + [OPAL_DATATYPE_LONG] = sizeof(long), + [OPAL_DATATYPE_UNSIGNED_LONG] = sizeof(unsigned long), }; /* @@ -160,6 +164,8 @@ OPAL_DECLSPEC const opal_datatype_t *opal_datatype_basicDatatypes[OPAL_DATATYPE_ [OPAL_DATATYPE_LONG_DOUBLE_COMPLEX] = &opal_datatype_long_double_complex, [OPAL_DATATYPE_BOOL] = &opal_datatype_bool, [OPAL_DATATYPE_WCHAR] = &opal_datatype_wchar, + [OPAL_DATATYPE_LONG] = &opal_datatype_long, + [OPAL_DATATYPE_UNSIGNED_LONG] = &opal_datatype_unsigned_long, [OPAL_DATATYPE_UNAVAILABLE] = &opal_datatype_unavailable, }; @@ -258,7 +264,7 @@ int32_t opal_datatype_init(void) int32_t i; /** - * Force he initialization of the opal_datatype_t 
class. This will allow us to + * Force the initialization of the opal_datatype_t class. This will allow us to * call OBJ_DESTRUCT without going too deep in the initialization process. */ opal_class_initialize(OBJ_CLASS(opal_datatype_t)); diff --git a/opal/datatype/opal_datatype_optimize.c b/opal/datatype/opal_datatype_optimize.c index de6c3eb6560..dbf7aeb86a3 100644 --- a/opal/datatype/opal_datatype_optimize.c +++ b/opal/datatype/opal_datatype_optimize.c @@ -94,7 +94,9 @@ static int32_t opal_datatype_optimize_short(opal_datatype_t *pData, size_t count assert(1 == current->count); if ((current->common.type == OPAL_DATATYPE_LOOP) || compress.common.type != current->common.type) { - compress.common.type = OPAL_DATATYPE_UINT1; + compress.common.type = OPAL_DATATYPE_UINT1; + compress.common.flags |= OPAL_DATATYPE_OPTIMIZED_RESTRICTED; + pData->flags |= OPAL_DATATYPE_OPTIMIZED_RESTRICTED; compress.blocklen = end_loop->size; break; } @@ -194,7 +196,9 @@ static int32_t opal_datatype_optimize_short(opal_datatype_t *pData, size_t count ddt_elem_desc_t save = last; /* safekeep the type and blocklen */ if (last.common.type != current->common.type) { last.blocklen *= opal_datatype_basicDatatypes[last.common.type]->size; - last.common.type = OPAL_DATATYPE_UINT1; + last.common.type = OPAL_DATATYPE_UINT1; + last.common.flags |= OPAL_DATATYPE_OPTIMIZED_RESTRICTED; + pData->flags |= OPAL_DATATYPE_OPTIMIZED_RESTRICTED; } if ((last.extent * (ptrdiff_t) last.count + last.disp) == current->disp) { @@ -248,7 +252,9 @@ static int32_t opal_datatype_optimize_short(opal_datatype_t *pData, size_t count + (current->blocklen * opal_datatype_basicDatatypes[current->common.type] ->size)); - last.common.type = OPAL_DATATYPE_UINT1; + last.common.type = OPAL_DATATYPE_UINT1; + last.common.flags |= OPAL_DATATYPE_OPTIMIZED_RESTRICTED; + pData->flags |= OPAL_DATATYPE_OPTIMIZED_RESTRICTED; } last.extent += current->extent; if (current->count != 1) { diff --git a/opal/datatype/opal_datatype_pack.c 
b/opal/datatype/opal_datatype_pack.c index 51bf0fecec4..d1b9e1a538a 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -375,7 +375,7 @@ int32_t opal_generic_simple_pack_function(opal_convertor_t *pConvertor, struct i *max_data = total_packed; pConvertor->bConverted += total_packed; /* update the already converted bytes */ *out_size = iov_count; - if (pConvertor->bConverted == pConvertor->local_size) { + if (pConvertor->bConverted == pConvertor->remote_size) { pConvertor->flags |= CONVERTOR_COMPLETED; return 1; } @@ -398,47 +398,88 @@ int32_t opal_generic_simple_pack_function(opal_convertor_t *pConvertor, struct i * to a contiguous output buffer with a predefined size. * return OPAL_SUCCESS if everything went OK and if there is still room before the complete * conversion of the data (need additional call with others input buffers ) - * 1 if everything went fine and the data was completly converted + * 1 if everything went fine and the data was completely converted * -1 something wrong occurs. 
*/ -static inline void pack_predefined_heterogeneous(opal_convertor_t *CONVERTOR, - const dt_elem_desc_t *ELEM, size_t *COUNT, - unsigned char **SOURCE, - unsigned char **DESTINATION, size_t *SPACE) +static inline void +pack_predefined_heterogeneous(opal_convertor_t *CONVERTOR, + const dt_elem_desc_t *ELEM, size_t *COUNT, + unsigned char **memory, + unsigned char **packed, size_t *SPACE) { const opal_convertor_master_t *master = (CONVERTOR)->master; const ddt_elem_desc_t *_elem = &((ELEM)->elem); - unsigned char *_source = (*SOURCE) + _elem->disp; - ptrdiff_t advance; - size_t _count = *(COUNT); - size_t _r_blength; - - _r_blength = master->remote_sizes[_elem->common.type]; - if ((_count * _r_blength) > *(SPACE)) { - _count = (*(SPACE) / _r_blength); - if (0 == _count) { - return; /* nothing to do */ - } + size_t cando_count = *(COUNT), do_now_bytes; + size_t local_elem_size = opal_datatype_basicDatatypes[_elem->common.type]->size; + size_t remote_elem_size = master->remote_sizes[_elem->common.type]; + size_t blocklen_bytes = remote_elem_size; + unsigned char *_memory = (*memory) + _elem->disp; + unsigned char *_packed = *packed; + ptrdiff_t advance = 0; + + assert(0 == (cando_count % _elem->blocklen)); /* no partials here */ + assert(*(COUNT) <= ((size_t) _elem->count * _elem->blocklen)); + + if ((remote_elem_size * cando_count) > *(SPACE)) + cando_count = (*SPACE) / blocklen_bytes; + + /* preemptively update the number of COUNT we will return. 
*/ + *(COUNT) -= cando_count; + + if (_elem->blocklen == 1) { + master->pFunctions[_elem->common.type](CONVERTOR, cando_count, + _memory, *SPACE, _elem->extent, + _packed, *SPACE, remote_elem_size, + &advance); + _memory += cando_count * _elem->extent; + _packed += cando_count * remote_elem_size; + goto update_and_return; + } + + if ((1 < _elem->count) && (_elem->blocklen <= cando_count)) { + blocklen_bytes = remote_elem_size * _elem->blocklen; + + do { /* Do as many full blocklen as possible */ + OPAL_DATATYPE_SAFEGUARD_POINTER(_memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count); + DO_DEBUG(opal_output(0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n", + (void *) _packed, (void *) _memory, (unsigned long) blocklen_bytes, + (unsigned long) (*(SPACE) - (_packed - *(packed))));); + master->pFunctions[_elem->common.type](CONVERTOR, _elem->blocklen, + _memory, *SPACE, local_elem_size, + _packed, *SPACE, remote_elem_size, + &advance); + _packed += blocklen_bytes; + _memory += _elem->extent; + cando_count -= _elem->blocklen; + } while (_elem->blocklen <= cando_count); + } + + /** + * As an epilog do anything left from the last blocklen. + */ + if (0 != cando_count) { + assert((cando_count < _elem->blocklen) + || ((1 == _elem->count) && (cando_count <= _elem->blocklen))); + do_now_bytes = cando_count * remote_elem_size; + OPAL_DATATYPE_SAFEGUARD_POINTER(_memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count); + DO_DEBUG(opal_output(0, "pack 3. 
memcpy( %p, %p, %lu ) => space %lu [epilog]\n", + (void *) _packed, (void *) _memory, (unsigned long) do_now_bytes, + (unsigned long) (*(SPACE) - (_packed - *(packed))));); + master->pFunctions[_elem->common.type](CONVERTOR, cando_count, + _memory, *SPACE, local_elem_size, + _packed, *SPACE, remote_elem_size, + &advance); + _memory += cando_count * local_elem_size; + _packed += do_now_bytes; } - OPAL_DATATYPE_SAFEGUARD_POINTER(_source, (_count * _elem->extent), (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count); - DO_DEBUG(opal_output(0, "pack [l %s r %s] memcpy( %p, %p, %lu ) => space %lu\n", - ((ptrdiff_t)(opal_datatype_basicDatatypes[_elem->common.type]->size) - == _elem->extent) - ? "cont" - : "----", - ((ptrdiff_t) _r_blength == _elem->extent) ? "cont" : "----", - (void *) *(DESTINATION), (void *) _source, (unsigned long) _r_blength, - (unsigned long) (*(SPACE)));); - master->pFunctions[_elem->common.type](CONVERTOR, _count, _source, *SPACE, _elem->extent, - *DESTINATION, *SPACE, _r_blength, &advance); - _r_blength *= _count; /* update the remote length to encompass all the elements */ - *(SOURCE) += _count * _elem->extent; - *(DESTINATION) += _r_blength; - *(SPACE) -= _r_blength; - *(COUNT) -= _count; +update_and_return: + *(memory) = _memory - _elem->disp; + *(SPACE) -= (_packed - *packed); + *(packed) = _packed; } int32_t opal_pack_general_function(opal_convertor_t *pConvertor, struct iovec *iov, @@ -574,7 +615,9 @@ int32_t opal_pack_general_function(opal_convertor_t *pConvertor, struct iovec *i *max_data = total_packed; pConvertor->bConverted += total_packed; /* update the already converted bytes */ *out_size = iov_count; - if (pConvertor->bConverted == pConvertor->local_size) { + size_t expected_packed_size; + opal_convertor_get_packed_size(pConvertor, &expected_packed_size); + if (pConvertor->bConverted == expected_packed_size) { pConvertor->flags |= CONVERTOR_COMPLETED; return 1; } diff --git a/opal/datatype/opal_datatype_unpack.c 
b/opal/datatype/opal_datatype_unpack.c index 7ac93582d9d..629c7762ec0 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -222,7 +222,7 @@ static inline void opal_unpack_partial_datatype(opal_convertor_t *pConvertor, dt /* reload the length as it is reset by the macro */ data_length = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; - /* For every occurence of the unused byte move data from the saved + /* For every occurrence of the unused byte move data from the saved * buffer back into the user memory. */ #if OPAL_CUDA_SUPPORT @@ -422,7 +422,7 @@ int32_t opal_generic_simple_unpack_function(opal_convertor_t *pConvertor, struct *max_data = total_unpacked; pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ *out_size = iov_count; - if (pConvertor->bConverted == pConvertor->remote_size) { + if (pConvertor->bConverted == pConvertor->local_size) { pConvertor->flags |= CONVERTOR_COMPLETED; return 1; } @@ -446,9 +446,89 @@ int32_t opal_generic_simple_unpack_function(opal_convertor_t *pConvertor, struct * to a contiguous output buffer with a predefined size. * return OPAL_SUCCESS if everything went OK and if there is still room before the complete * conversion of the data (need additional call with others input buffers ) - * 1 if everything went fine and the data was completly converted + * 1 if everything went fine and the data was completely converted * -1 something wrong occurs. 
*/ +static inline void +unpack_predefined_heterogeneous(opal_convertor_t *CONVERTOR, + const dt_elem_desc_t *ELEM, size_t *COUNT, + unsigned char **memory, + unsigned char **packed, size_t *SPACE) +{ + const opal_convertor_master_t *master = (CONVERTOR)->master; + const ddt_elem_desc_t *_elem = &((ELEM)->elem); + size_t cando_count = *(COUNT), do_now_bytes; + size_t local_elem_size = opal_datatype_basicDatatypes[_elem->common.type]->size; + size_t remote_elem_size = master->remote_sizes[_elem->common.type]; + size_t blocklen_bytes = remote_elem_size; + unsigned char *_memory = (*memory) + _elem->disp; + unsigned char *_packed = *packed; + ptrdiff_t advance = 0; + + assert(0 == (cando_count % _elem->blocklen)); /* no partials here */ + assert(*(COUNT) <= ((size_t) _elem->count * _elem->blocklen)); + + if ((remote_elem_size * cando_count) > *(SPACE)) + cando_count = (*SPACE) / blocklen_bytes; + + /* preemptively update the number of COUNT we will return. */ + *(COUNT) -= cando_count; + + if (_elem->blocklen == 1) { + master->pFunctions[_elem->common.type](CONVERTOR, cando_count, + _packed, *SPACE, remote_elem_size, + _memory, *SPACE, _elem->extent, + &advance); + _memory += cando_count * _elem->extent; + _packed += cando_count * remote_elem_size; + goto update_and_return; + } + + if ((1 < _elem->count) && (_elem->blocklen <= cando_count)) { + blocklen_bytes = remote_elem_size * _elem->blocklen; + + do { /* Do as many full blocklen as possible */ + OPAL_DATATYPE_SAFEGUARD_POINTER(_memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count); + DO_DEBUG(opal_output(0, "pack 2. 
memcpy( %p, %p, %lu ) => space %lu\n", + (void *) _packed, (void *) _memory, (unsigned long) blocklen_bytes, + (unsigned long) (*(SPACE) - (_packed - *(packed))));); + master->pFunctions[_elem->common.type](CONVERTOR, _elem->blocklen, + _packed, *SPACE, remote_elem_size, + _memory, *SPACE, local_elem_size, + &advance); + _packed += blocklen_bytes; + _memory += _elem->extent; + cando_count -= _elem->blocklen; + } while (_elem->blocklen <= cando_count); + } + + /** + * As an epilog do anything left from the last blocklen. + */ + if (0 != cando_count) { + assert((cando_count < _elem->blocklen) + || ((1 == _elem->count) && (cando_count <= _elem->blocklen))); + do_now_bytes = cando_count * remote_elem_size; + OPAL_DATATYPE_SAFEGUARD_POINTER(_memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count); + DO_DEBUG(opal_output(0, "pack 3. memcpy( %p, %p, %lu ) => space %lu [epilog]\n", + (void *) _packed, (void *) _memory, (unsigned long) do_now_bytes, + (unsigned long) (*(SPACE) - (_packed - *(packed))));); + master->pFunctions[_elem->common.type](CONVERTOR, cando_count, + _packed, *SPACE, remote_elem_size, + _memory, *SPACE, local_elem_size, + &advance); + _memory += cando_count * local_elem_size; + _packed += do_now_bytes; + } + +update_and_return: + *(memory) = _memory - _elem->disp; + *(SPACE) -= (_packed - *packed); + *(packed) = _packed; +} + int32_t opal_unpack_general_function(opal_convertor_t *pConvertor, struct iovec *iov, uint32_t *out_size, size_t *max_data) { @@ -463,9 +543,10 @@ int32_t opal_unpack_general_function(opal_convertor_t *pConvertor, struct iovec unsigned char *conv_ptr, *iov_ptr; uint32_t iov_count; size_t iov_len_local; - +#if 0 const opal_convertor_master_t *master = pConvertor->master; ptrdiff_t advance; /* number of bytes that we should advance the buffer */ +#endif size_t rc; DO_DEBUG(opal_output(0, "opal_convertor_general_unpack( %p, {%p, %lu}, %d )\n", @@ -509,15 +590,8 @@ int32_t 
opal_unpack_general_function(opal_convertor_t *pConvertor, struct iovec conv_ptr + pElem->elem.disp - pConvertor->pBaseBuf, count_desc, description[pos_desc].elem.extent, opal_datatype_basicDatatypes[type]->name);); - rc = master->pFunctions[type](pConvertor, count_desc, iov_ptr, iov_len_local, - opal_datatype_basicDatatypes[type]->size, - conv_ptr + pElem->elem.disp, - (pConvertor->pDesc->ub - pConvertor->pDesc->lb) - * pConvertor->count, - description[pos_desc].elem.extent, &advance); - iov_len_local -= advance; /* decrease the available space in the buffer */ - iov_ptr += advance; /* increase the pointer to the buffer */ - count_desc -= rc; /* compute leftovers */ + unpack_predefined_heterogeneous(pConvertor, pElem, &count_desc, &conv_ptr, &iov_ptr, + &iov_len_local); if (0 == count_desc) { /* completed */ conv_ptr = pConvertor->pBaseBuf + pStack->disp; pos_desc++; /* advance to the next data */ @@ -527,7 +601,6 @@ int32_t opal_unpack_general_function(opal_convertor_t *pConvertor, struct iovec } continue; } - conv_ptr += rc * description[pos_desc].elem.extent; assert(pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED); assert(0 == iov_len_local); if (0 != iov_len_local) { @@ -535,8 +608,7 @@ int32_t opal_unpack_general_function(opal_convertor_t *pConvertor, struct iovec /* We have some partial data here. Let's copy it into the convertor * and keep it hot until the next round. 
*/ - assert(iov_len_local - < opal_datatype_basicDatatypes[pElem->elem.common.type]->size); + assert(iov_len_local < opal_datatype_basicDatatypes[pElem->elem.common.type]->size); COMPUTE_CSUM(iov_ptr, iov_len_local, pConvertor); opal_unpack_partial_datatype(pConvertor, pElem, iov_ptr, 0, iov_len_local, @@ -596,7 +668,9 @@ int32_t opal_unpack_general_function(opal_convertor_t *pConvertor, struct iovec *max_data = total_unpacked; pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ *out_size = iov_count; - if (pConvertor->bConverted == pConvertor->remote_size) { + size_t expected_packed_size; + opal_convertor_get_packed_size(pConvertor, &expected_packed_size); + if (pConvertor->bConverted == expected_packed_size) { pConvertor->flags |= CONVERTOR_COMPLETED; return 1; } diff --git a/opal/util/arch.h b/opal/util/arch.h index 115cd65af69..78a394651b8 100644 --- a/opal/util/arch.h +++ b/opal/util/arch.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -51,7 +51,7 @@ ** The fortran integer is dismissed here, since there is no ** platform known to me, were fortran and C-integer do not match ** -** The following abbriviations are introduced: +** The following abbreviations are introduced: ** ** a) il32 (int long are 32 bits) (e.g. 
IA32 LINUX, SGI n32, SUN) ** diff --git a/test/datatype/external32.c b/test/datatype/external32.c index d09938510ba..9d47e60950a 100644 --- a/test/datatype/external32.c +++ b/test/datatype/external32.c @@ -33,13 +33,29 @@ int check_vector( void* send_buffer, void* packed, static int pack_unpack_datatype( void* send_data, ompi_datatype_t *datatype, int count, void* recv_data, checker_t validator, void *validator_arg ); -static void dump_hex(void* what, size_t length); - -static void dump_hex(void* what, size_t length) +static void +dump_hex(const char* msg, const void* vbuf, int nbytes, + int start_from, int stop_at, int vals_per_line) { - size_t i; - for( i = 0; i < length; i++ ) { - printf("%02x", (unsigned int)(((unsigned char*)what)[i])); + const char* buf = (const char*)vbuf; + + if( -1 == stop_at ) stop_at = nbytes; + + for (int i = (start_from / vals_per_line) * vals_per_line; i < nbytes; ++i) { + if( i >= stop_at ) return; + if (0 == (i % vals_per_line)) { + if( NULL == msg) printf("\n"); + else printf("\n%s", msg); + } else { + if (i % 4 == 0) { + printf(" "); + } + } + printf(" "); + if( i < start_from ) + printf(" "); + else + printf("%02x", *((unsigned char *)(buf + i))); } } @@ -131,7 +147,8 @@ static int pack_unpack_datatype( void* send_data, ompi_datatype_t *datatype, int return -1; } - printf("packed %ld bytes into a %ld bytes buffer ", position, buffer_size); dump_hex(buffer, position); printf("\n"); + printf("packed %ld bytes into a %ld bytes buffer ", position, buffer_size); + dump_hex(NULL, buffer, position, 0, -1, 24); printf("\n"); position = 0; error = ompi_datatype_unpack_external("external32", buffer, buffer_size, &position, @@ -155,12 +172,14 @@ int main(int argc, char *argv[]) if( verbose ) { printf("send data %08x %08x \n", send_data[0], send_data[1]); - printf("data "); dump_hex(&send_data, sizeof(int32_t) * 2); printf("\n"); + printf("data "); + dump_hex(NULL, &send_data, sizeof(int32_t) * 2, 0, -1, 24); printf("\n"); } 
(void)pack_unpack_datatype( send_data, &ompi_mpi_int32_t.dt, 2, recv_data, check_contiguous, (void*)&ompi_mpi_int32_t.dt ); if( verbose ) { - printf("recv "); dump_hex(&recv_data, sizeof(int32_t) * 2); printf("\n"); + printf("recv "); + dump_hex(NULL, &recv_data, sizeof(int32_t) * 2, 0, -1, 24); printf("\n"); printf("recv data %08x %08x \n", recv_data[0], recv_data[1]); } if( (send_data[0] != recv_data[0]) || (send_data[1] != recv_data[1]) ) { @@ -175,12 +194,14 @@ int main(int argc, char *argv[]) if( verbose ) { printf("send data %08x %08x \n", send_data[0], send_data[1]); - printf("data "); dump_hex(&send_data, sizeof(int16_t) * 2); printf("\n"); + printf("data "); + dump_hex(NULL, &send_data, sizeof(int16_t) * 2, 0, -1, 24); printf("\n"); } (void)pack_unpack_datatype( send_data, &ompi_mpi_int16_t.dt, 2, recv_data, check_contiguous, (void*)&ompi_mpi_int16_t.dt ); if( verbose ) { - printf("recv "); dump_hex(&recv_data, sizeof(int16_t) * 2); printf("\n"); + printf("recv "); + dump_hex(NULL, &recv_data, sizeof(int16_t) * 2, 0, -1, 24); printf("\n"); printf("recv data %08x %08x \n", recv_data[0], recv_data[1]); } if( (send_data[0] != recv_data[0]) || (send_data[1] != recv_data[1]) ) { @@ -208,16 +229,18 @@ int main(int argc, char *argv[]) if( verbose ) { printf("send data %08x %x08x %08x \n", send_data[0], send_data[1], send_data[2]); - printf("data "); dump_hex(&send_data, sizeof(int32_t) * 3); printf("\n"); + printf("data "); dump_hex(NULL, &send_data, sizeof(int32_t) * 3, 0, -1, 24); printf("\n"); } (void)pack_unpack_datatype( send_data, ddt, 1, recv_data, check_vector, (void*)&ompi_mpi_int32_t.dt ); if( verbose ) { - printf("recv "); dump_hex(&recv_data, sizeof(int32_t) * 3); printf("\n"); + printf("recv "); dump_hex(NULL, &recv_data, sizeof(int32_t) * 3, 0, -1, 24); printf("\n"); printf("recv data %08x %08x %08x \n", recv_data[0], recv_data[1], recv_data[2]); } ompi_datatype_destroy(&ddt); if( (send_data[0] != recv_data[0]) || (send_data[2] != recv_data[2]) ) { 
printf("Error during external32 pack/unack for vector types (MPI_INT32_T)\n"); + printf("[0]: %d ? %d | [2]: %d ? %d ([1]: %d ? %d)\n", send_data[0], recv_data[0], + send_data[2], recv_data[2], send_data[1], recv_data[1]); exit(-1); } }