From 18777108ea2de44a8e6e9faaded91723aa6d6624 Mon Sep 17 00:00:00 2001
From: Mark Allen <markalle@us.ibm.com>
Date: Wed, 22 Jul 2020 00:28:56 -0400
Subject: [PATCH 1/3] adding .ompi_id field to opal datatypes for predefined
 types

external32 requires us to distinguish MPI_LONG from MPI_INT64_T,
but prior to this commit the two MPI types ompi_mpi_long and
ompi_mpi_int64_t were constructed with the same OPAL initialization,
referring to the same OPAL_DATATYPE_INT8 for the opal .id field
and for the .common.type in the description entry.

This commit adds an .ompi_id field (only for the predefined
OMPI types) to both the opal datatype and to the description
at desc[].elem.common.ompi_id.  OPAL only needs to know how
many there are when it walks the arrays, so the actual values
in .ompi_id remain opaque to it.  The .ompi_id isn't required
to be set, as opal datatypes don't have to be associated with
an OMPI datatype.

The macros set .dt.super.ompi_id for the predefined types,
but there are extra steps to reset the .desc entry's .ompi_id
since the initial construction via the opal macros points
many .desc entries at the same place based on its underlying
opal type.  Since the point is to let MPI_LONG's .desc diverge
from MPI_INT64_T's for example, the .desc entries had to be
reset.

Datatype constants that formerly shared the same value, like
    OMPI_DATATYPE_MPI_LONG
    OMPI_DATATYPE_MPI_INT64_T
were separated of course, but cases like
    OMPI_DATATYPE_MPI_C_DOUBLE_COMPLEX
    OMPI_DATATYPE_MPI_CXX_DOUBLE_COMPLEX
are also being separated for a more implementation-specific reason
rather than it being logically necessary.  The opal datatype
initialization sets the .desc field statically at shared offsets
into shared opal_datatype_predefined_elem_desc[].  So later when
I want to set an .ompi_id in .desc[] I have to privatize everything.
In this example "everything" means both
    ompi_mpi_c_double_complex
    ompi_mpi_cxx_dblcplex
but putting both of those into ompi_datatype_basicDatatypes[]
requires separating their constants.

Other changes required to support this:

.ptypes:

opal datatypes have a .ptypes field that previously was just an
array counting how many of each base OPAL type was contained, and
this was used to figure out the size of the packed type.  But
to get the correct size we need ompi_ids in that array.  So this
turns .ptypes into a struct containing the original array
(relocated to .ptypes.bdt_count[]) and a new .ptypes.ompi_dt_count[].
Since types don't have to have an .ompi_id, there's also a flag to
indicate whether .ompi_dt_count[] has valid data or not.

convertor.master.ompi_remote_sizes[]:

The convertor had a .remote_sizes[] that's indexed by the base opal
types.  But to specify sizes for external32 I added .ompi_remote_sizes[]
which is the same thing but indexed by ompi_id.  Convertors don't
have to specify a set of .ompi_remote_sizes[] since opal types don't
even have to have .ompi_ids, so for initialization the same call that
sets up the .ompi_remote_sizes[] also sets .ompi_remote_sizes_is_set.
The opal sizes are used if the ompi sizes/IDs aren't set.

Signed-off-by: Mark Allen <markalle@us.ibm.com>
---
 ompi/datatype/ompi_datatype.h                 |   2 +-
 ompi/datatype/ompi_datatype_external32.c      |  62 +++
 ompi/datatype/ompi_datatype_get_elements.c    |   2 +-
 ompi/datatype/ompi_datatype_internal.h        | 366 ++++--------------
 ompi/datatype/ompi_datatype_module.c          |  51 +++
 opal/datatype/Makefile.am                     |   4 +-
 opal/datatype/opal_convertor.c                |  56 ++-
 opal/datatype/opal_convertor.h                |   5 +
 opal/datatype/opal_convertor_internal.h       |   9 +
 opal/datatype/opal_datatype.h                 |  41 +-
 opal/datatype/opal_datatype_add.c             |  27 +-
 opal/datatype/opal_datatype_dump.c            |   2 +-
 opal/datatype/opal_datatype_get_count.c       |  24 +-
 opal/datatype/opal_datatype_internal.h        | 213 +++++-----
 opal/datatype/opal_datatype_module.c          |  52 +--
 opal/datatype/opal_datatype_optimize.c        |  14 +-
 .../opal_datatype_pack_unpack_predefined.h    |  16 +-
 opal/datatype/opal_datatype_reset_predef.c    |  99 +++++
 opal/datatype/opal_datatype_setexternal32.c   |  32 ++
 test/datatype/ddt_pack.c                      |  12 +-
 20 files changed, 631 insertions(+), 458 deletions(-)
 create mode 100644 opal/datatype/opal_datatype_reset_predef.c
 create mode 100644 opal/datatype/opal_datatype_setexternal32.c

diff --git a/ompi/datatype/ompi_datatype.h b/ompi/datatype/ompi_datatype.h
index e2ee2a79c01..ea7ed3bd56e 100644
--- a/ompi/datatype/ompi_datatype.h
+++ b/ompi/datatype/ompi_datatype.h
@@ -59,7 +59,7 @@ BEGIN_C_DECLS
 #define OMPI_DATATYPE_FLAG_DATA_FORTRAN  0xC000
 #define OMPI_DATATYPE_FLAG_DATA_LANGUAGE 0xC000
 
-#define OMPI_DATATYPE_MAX_PREDEFINED 50
+#define OMPI_DATATYPE_MAX_PREDEFINED 80
 
 #if OMPI_DATATYPE_MAX_PREDEFINED > OPAL_DATATYPE_MAX_SUPPORTED
 #error Need to increase the number of supported dataypes by OPAL (value OPAL_DATATYPE_MAX_SUPPORTED).
diff --git a/ompi/datatype/ompi_datatype_external32.c b/ompi/datatype/ompi_datatype_external32.c
index 108e14258b7..711821ca4ea 100644
--- a/ompi/datatype/ompi_datatype_external32.c
+++ b/ompi/datatype/ompi_datatype_external32.c
@@ -11,6 +11,7 @@
  * Copyright (c) 2004-2006 The Regents of the University of California.
  *                         All rights reserved.
  * Copyright (c) 2009      Oak Ridge National Labs.  All rights reserved.
+ * Copyright (c) 2020      IBM Corporation.  All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -23,6 +24,7 @@
 #include "opal/datatype/opal_convertor.h"
 #include "opal/util/arch.h"
 #include "ompi/datatype/ompi_datatype.h"
+#include "ompi/datatype/ompi_datatype_internal.h"
 
 /* From the MPI standard. external32 use the following types:
  *   Type Length
@@ -76,10 +78,13 @@ uint32_t ompi_datatype_external32_arch_id = OPAL_ARCH_LDEXPSIZEIS15 | OPAL_ARCH_
 opal_convertor_t* ompi_mpi_external32_convertor = NULL;
 opal_convertor_t* ompi_mpi_local_convertor = NULL;
 
+static void set_external32_sizes(opal_convertor_t *convertor);
+
 int32_t ompi_datatype_default_convertors_init( void )
 {
    /* create the extern32 convertor */
     ompi_mpi_external32_convertor = opal_convertor_create( ompi_datatype_external32_arch_id, 0 );
+    set_external32_sizes(ompi_mpi_external32_convertor);
 
     /* create the local convertor */
     ompi_mpi_local_convertor = opal_convertor_create( opal_local_arch, 0 );
@@ -95,3 +100,60 @@ int32_t ompi_datatype_default_convertors_fini( void )
 
     return OMPI_SUCCESS;
 }
+
+static void
+set_external32_sizes(opal_convertor_t *convertor)
+{
+    int i;
+
+    /* Every predefined datatype needs a size: start from the local sizes, */
+    /* then override whatever the MPI standard specifies for external32. */
+    for (i=OMPI_DATATYPE_MPI_EMPTY+1; i<OMPI_DATATYPE_MPI_MAX_PREDEFINED; ++i) {
+        if (i != OMPI_DATATYPE_MPI_LB && i != OMPI_DATATYPE_MPI_UB) {
+            opal_convertor_set_ompi_remote_size(convertor,
+                i, ompi_datatype_basicDatatypes[i]->super.size);
+        }
+    }
+
+    opal_convertor_set_ompi_remote_size(convertor, MPI_PACKED->super.ompi_id, 1);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_BYTE->super.ompi_id, 1);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_CHAR->super.ompi_id, 1);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_UNSIGNED_CHAR->super.ompi_id, 1);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_SIGNED_CHAR->super.ompi_id, 1);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_WCHAR->super.ompi_id, 2);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_SHORT->super.ompi_id, 2);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_UNSIGNED_SHORT->super.ompi_id, 2);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_INT->super.ompi_id, 4);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_UNSIGNED->super.ompi_id, 4);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_LONG->super.ompi_id, 4);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_UNSIGNED_LONG->super.ompi_id, 4);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_LONG_LONG_INT->super.ompi_id, 8);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_UNSIGNED_LONG_LONG->super.ompi_id, 8);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_FLOAT->super.ompi_id, 4);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_DOUBLE->super.ompi_id, 8);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_LONG_DOUBLE->super.ompi_id, 16);
+
+    opal_convertor_set_ompi_remote_size(convertor, MPI_C_BOOL->super.ompi_id, 1);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_AINT->super.ompi_id, 8);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_COUNT->super.ompi_id, 8);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_OFFSET->super.ompi_id, 8);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_C_COMPLEX->super.ompi_id, 2*4);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_C_FLOAT_COMPLEX->super.ompi_id, 2*4);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_C_DOUBLE_COMPLEX->super.ompi_id, 2*8);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_C_LONG_DOUBLE_COMPLEX->super.ompi_id, 2*16);
+
+    opal_convertor_set_ompi_remote_size(convertor, MPI_CHARACTER->super.ompi_id, 1);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_LOGICAL->super.ompi_id, 4);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_INTEGER->super.ompi_id, 4);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_REAL->super.ompi_id, 4);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_DOUBLE_PRECISION->super.ompi_id, 8);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_COMPLEX->super.ompi_id, 2*4);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_DOUBLE_COMPLEX->super.ompi_id, 2*8);
+
+    opal_convertor_set_ompi_remote_size(convertor, MPI_CXX_BOOL->super.ompi_id, 1);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_CXX_FLOAT_COMPLEX->super.ompi_id, 2*4);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_CXX_DOUBLE_COMPLEX->super.ompi_id, 2*8);
+    opal_convertor_set_ompi_remote_size(convertor, MPI_CXX_LONG_DOUBLE_COMPLEX->super.ompi_id, 2*16);
+
+    opal_convertor_ompi_remote_size_is_ready(convertor);
+}
diff --git a/ompi/datatype/ompi_datatype_get_elements.c b/ompi/datatype/ompi_datatype_get_elements.c
index 72ac87d6df7..e9724f3b4d3 100644
--- a/ompi/datatype/ompi_datatype_get_elements.c
+++ b/ompi/datatype/ompi_datatype_get_elements.c
@@ -52,7 +52,7 @@ int ompi_datatype_get_elements (ompi_datatype_t *datatype, size_t ucount, size_t
             opal_datatype_compute_ptypes(&datatype->super);
             /* count the basic elements in the datatype */
             for (i = OPAL_DATATYPE_FIRST_TYPE, total = 0 ; i < OPAL_DATATYPE_MAX_PREDEFINED ; ++i) {
-                total += datatype->super.ptypes[i];
+                total += datatype->super.ptypes->bdt_count[i];
             }
             internal_count = total * internal_count;
         }
diff --git a/ompi/datatype/ompi_datatype_internal.h b/ompi/datatype/ompi_datatype_internal.h
index 0c175c4266c..d945f436009 100644
--- a/ompi/datatype/ompi_datatype_internal.h
+++ b/ompi/datatype/ompi_datatype_internal.h
@@ -10,6 +10,7 @@
  * Copyright (c) 2015-2018 Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
  * Copyright (c) 2016-2018 FUJITSU LIMITED.  All rights reserved.
+ * Copyright (c) 2020      IBM Corporation.  All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -108,306 +109,90 @@
 #define OMPI_DATATYPE_MPI_SHORT_FLOAT             0x30
 #define OMPI_DATATYPE_MPI_C_SHORT_FLOAT_COMPLEX   0x31
 
-/* This should __ALWAYS__ stay last  */
-#define OMPI_DATATYPE_MPI_UNAVAILABLE             0x32
+#define OMPI_DATATYPE_MPI_CHAR                    0x32
+#define OMPI_DATATYPE_MPI_SIGNED_CHAR             0x33
+#define OMPI_DATATYPE_MPI_UNSIGNED_CHAR           0x34
+#define OMPI_DATATYPE_MPI_BYTE                    0x35
+#define OMPI_DATATYPE_MPI_SHORT                   0x36
+#define OMPI_DATATYPE_MPI_UNSIGNED_SHORT          0x37
+#define OMPI_DATATYPE_MPI_INT                     0x38
+#define OMPI_DATATYPE_MPI_UNSIGNED                0x39
+#define OMPI_DATATYPE_MPI_LONG                    0x3A
+#define OMPI_DATATYPE_MPI_UNSIGNED_LONG           0x3B
+#define OMPI_DATATYPE_MPI_LONG_LONG_INT           0x3C
+#define OMPI_DATATYPE_MPI_UNSIGNED_LONG_LONG      0x3D
+#define OMPI_DATATYPE_MPI_LOGICAL1                0x3E
+#define OMPI_DATATYPE_MPI_LOGICAL2                0x3F
+#define OMPI_DATATYPE_MPI_LOGICAL4                0x40
+#define OMPI_DATATYPE_MPI_LOGICAL8                0x41
+#define OMPI_DATATYPE_MPI_INTEGER1                0x42
+#define OMPI_DATATYPE_MPI_INTEGER2                0x43
+#define OMPI_DATATYPE_MPI_INTEGER4                0x44
+#define OMPI_DATATYPE_MPI_INTEGER8                0x45
+#define OMPI_DATATYPE_MPI_INTEGER16               0x46
+#define OMPI_DATATYPE_MPI_REAL2                   0x47
+#define OMPI_DATATYPE_MPI_REAL4                   0x48
+#define OMPI_DATATYPE_MPI_REAL8                   0x49
+#define OMPI_DATATYPE_MPI_REAL16                  0x4A
+
+#define OMPI_DATATYPE_MPI_CXX_BOOL                0x4B
+#define OMPI_DATATYPE_MPI_CXX_SHORT_FLOAT_COMPLEX 0x4C
+#define OMPI_DATATYPE_MPI_CXX_FLOAT_COMPLEX       0x4D
+#define OMPI_DATATYPE_MPI_CXX_DOUBLE_COMPLEX      0x4E
+#define OMPI_DATATYPE_MPI_CXX_LONG_DOUBLE_COMPLEX 0x4F
 
+/* This should __ALWAYS__ stay last  */
+#define OMPI_DATATYPE_MPI_UNAVAILABLE             0x50
 
 #define OMPI_DATATYPE_MPI_MAX_PREDEFINED          (OMPI_DATATYPE_MPI_UNAVAILABLE+1)
 
 /*
- * Ensure we can support the predefined datatypes.
- */
-#if OMPI_DATATYPE_MAX_PREDEFINED < OMPI_DATATYPE_MPI_UNAVAILABLE
-#error OMPI_DATATYPE_MAX_PREDEFINED should be updated to the value of OMPI_DATATYPE_MPI_UNAVAILABLE
-#endif
-
-/*
- * Mapped types. The following types have basic equivalents in OPAL. Instead
- * of being redefined as independent types, they will be made synonyms to
- * the most basic type.
+ *  Note all the mapping is removed above, for a couple reasons.
+ *
+ *  The first group of mapping that used to be here was a lot of
+ *    #define OMPI_DATATYPE_MPI_LONG OMPI_DATATYPE_MPI_INT64_T
+ *  for example.  But we need to have .ompi_id be unique enough
+ *  that external32 can specify a size for MPI_LONG that differs
+ *  from MPI_INT64_T so that had to change.
+ *
+ *  The second group was similar but in the other direction, there
+ *  were a few like
+ *    #define OMPI_DATATYPE_MPI_REAL4 OMPI_DATATYPE_MPI_REAL
+ *  which is a problem for the same reason.
+ *
+ *  The last group posed a more implementation-specific problem.
+ *  There used to be a few
+ *    #define OMPI_DATATYPE_MPI_CXX_DOUBLE_COMPLEX \
+ *            OMPI_DATATYPE_MPI_C_DOUBLE_COMPLEX
+ *  which I can't imagine being a problem, except for the way
+ *  a datatype's .desc[] field is initialized and modified.
+ *  These are two OMPI types for example created as
+ *      ompi_mpi_c_double_complex = ...PREDEFINED(C_DOUBLE_COMPLEX,)
+ *      ompi_mpi_cxx_dblcplex = ...BASIC_TYPE(,CXX_DOUBLE_COMPLEX)
+ *  where they both go through the same OPAL dt initialization
+ *  and initially both contain a .desc pointing to the same offset
+ *  into opal_datatype_predefined_elem_desc[].  I have a function
+ *  later in init that re-allocates .desc[] for any type in the
+ *  ompi_datatype_basicDatatypes[] array, but if we have the
+ *  above mapping then only ompi_mpi_c_double_complex is in the
+ *  array, not ompi_mpi_cxx_dblcplex so the mapped type wouldn't
+ *  get its .desc fixed unless we put it in a second list
+ *  somewhere.
  */
-#if SIZEOF_CHAR == 1
-#define OMPI_DATATYPE_MPI_CHAR                    OMPI_DATATYPE_MPI_INT8_T
-#define OMPI_DATATYPE_MPI_SIGNED_CHAR             OMPI_DATATYPE_MPI_INT8_T
-#define OMPI_DATATYPE_MPI_UNSIGNED_CHAR           OMPI_DATATYPE_MPI_UINT8_T
-#define OMPI_DATATYPE_MPI_BYTE                    OMPI_DATATYPE_MPI_UINT8_T
-#elif SIZEOF_CHAR == 2
-#define OMPI_DATATYPE_MPI_CHAR                    OMPI_DATATYPE_MPI_INT16_T
-#define OMPI_DATATYPE_MPI_SIGNED_CHAR             OMPI_DATATYPE_MPI_INT16_T
-#define OMPI_DATATYPE_MPI_UNSIGNED_CHAR           OMPI_DATATYPE_MPI_UINT16_T
-#define OMPI_DATATYPE_MPI_BYTE                    OMPI_DATATYPE_MPI_UINT16_T
-#elif SIZEOF_CHAR == 4
-#define OMPI_DATATYPE_MPI_CHAR                    OMPI_DATATYPE_MPI_INT32_T
-#define OMPI_DATATYPE_MPI_SIGNED_CHAR             OMPI_DATATYPE_MPI_INT32_T
-#define OMPI_DATATYPE_MPI_UNSIGNED_CHAR           OMPI_DATATYPE_MPI_UINT32_T
-#define OMPI_DATATYPE_MPI_BYTE                    OMPI_DATATYPE_MPI_UINT32_T
-#elif SIZEOF_CHAR == 8
-#define OMPI_DATATYPE_MPI_CHAR                    OMPI_DATATYPE_MPI_INT64_T
-#define OMPI_DATATYPE_MPI_SIGNED_CHAR             OMPI_DATATYPE_MPI_INT64_T
-#define OMPI_DATATYPE_MPI_UNSIGNED_CHAR           OMPI_DATATYPE_MPI_UINT64_T
-#define OMPI_DATATYPE_MPI_BYTE                    OMPI_DATATYPE_MPI_UINT64_T
-#endif
-
-#if SIZEOF_SHORT == 1
-#define OMPI_DATATYPE_MPI_SHORT                   OMPI_DATATYPE_MPI_INT8_T
-#define OMPI_DATATYPE_MPI_UNSIGNED_SHORT          OMPI_DATATYPE_MPI_UINT8_T
-#elif SIZEOF_SHORT == 2
-#define OMPI_DATATYPE_MPI_SHORT                   OMPI_DATATYPE_MPI_INT16_T
-#define OMPI_DATATYPE_MPI_UNSIGNED_SHORT          OMPI_DATATYPE_MPI_UINT16_T
-#elif SIZEOF_SHORT == 4
-#define OMPI_DATATYPE_MPI_SHORT                   OMPI_DATATYPE_MPI_INT32_T
-#define OMPI_DATATYPE_MPI_UNSIGNED_SHORT          OMPI_DATATYPE_MPI_UINT32_T
-#elif SIZEOF_SHORT == 8
-#define OMPI_DATATYPE_MPI_SHORT                   OMPI_DATATYPE_MPI_INT64_T
-#define OMPI_DATATYPE_MPI_UNSIGNED_SHORT          OMPI_DATATYPE_MPI_UINT64_T
-#endif
-
-#if SIZEOF_INT == 1
-#define OMPI_DATATYPE_MPI_INT                     OMPI_DATATYPE_MPI_INT8_T
-#define OMPI_DATATYPE_MPI_UNSIGNED                OMPI_DATATYPE_MPI_UINT8_T
-#elif SIZEOF_INT == 2
-#define OMPI_DATATYPE_MPI_INT                     OMPI_DATATYPE_MPI_INT16_T
-#define OMPI_DATATYPE_MPI_UNSIGNED                OMPI_DATATYPE_MPI_UINT16_T
-#elif SIZEOF_INT == 4
-#define OMPI_DATATYPE_MPI_INT                     OMPI_DATATYPE_MPI_INT32_T
-#define OMPI_DATATYPE_MPI_UNSIGNED                OMPI_DATATYPE_MPI_UINT32_T
-#elif SIZEOF_INT == 8
-#define OMPI_DATATYPE_MPI_INT                     OMPI_DATATYPE_MPI_INT64_T
-#define OMPI_DATATYPE_MPI_UNSIGNED                OMPI_DATATYPE_MPI_UINT64_T
-#endif
-
-#if SIZEOF_LONG == 1
-#define OMPI_DATATYPE_MPI_LONG                    OMPI_DATATYPE_MPI_INT8_T
-#define OMPI_DATATYPE_MPI_UNSIGNED_LONG           OMPI_DATATYPE_MPI_UINT8_T
-#elif SIZEOF_LONG == 2
-#define OMPI_DATATYPE_MPI_LONG                    OMPI_DATATYPE_MPI_INT16_T
-#define OMPI_DATATYPE_MPI_UNSIGNED_LONG           OMPI_DATATYPE_MPI_UINT16_T
-#elif SIZEOF_LONG == 4
-#define OMPI_DATATYPE_MPI_LONG                    OMPI_DATATYPE_MPI_INT32_T
-#define OMPI_DATATYPE_MPI_UNSIGNED_LONG           OMPI_DATATYPE_MPI_UINT32_T
-#elif SIZEOF_LONG == 8
-#define OMPI_DATATYPE_MPI_LONG                    OMPI_DATATYPE_MPI_INT64_T
-#define OMPI_DATATYPE_MPI_UNSIGNED_LONG           OMPI_DATATYPE_MPI_UINT64_T
+#if OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY != OMPI_DATATYPE_MPI_EMPTY
+#error OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY needs to mirror the value of OMPI_DATATYPE_MPI_EMPTY
 #endif
-
-#if SIZEOF_LONG_LONG == 1
-#define OMPI_DATATYPE_MPI_LONG_LONG_INT           OMPI_DATATYPE_MPI_INT8_T
-#define OMPI_DATATYPE_MPI_UNSIGNED_LONG_LONG      OMPI_DATATYPE_MPI_UINT8_T
-#elif SIZEOF_LONG_LONG == 2
-#define OMPI_DATATYPE_MPI_LONG_LONG_INT           OMPI_DATATYPE_MPI_INT16_T
-#define OMPI_DATATYPE_MPI_UNSIGNED_LONG_LONG      OMPI_DATATYPE_MPI_UINT16_T
-#elif SIZEOF_LONG_LONG == 4
-#define OMPI_DATATYPE_MPI_LONG_LONG_INT           OMPI_DATATYPE_MPI_INT32_T
-#define OMPI_DATATYPE_MPI_UNSIGNED_LONG_LONG      OMPI_DATATYPE_MPI_UINT32_T
-#elif SIZEOF_LONG_LONG == 8
-#define OMPI_DATATYPE_MPI_LONG_LONG_INT           OMPI_DATATYPE_MPI_INT64_T
-#define OMPI_DATATYPE_MPI_UNSIGNED_LONG_LONG      OMPI_DATATYPE_MPI_UINT64_T
+#if OPAL_MIRROR_OMPI_DATATYPE_MPI_MAX_PREDEFINED != OMPI_DATATYPE_MPI_MAX_PREDEFINED
+#error  OPAL_MIRROR_OMPI_DATATYPE_MPI_MAX_PREDEFINED needs to mirror the value of OMPI_DATATYPE_MPI_MAX_PREDEFINED
 #endif
 
 /*
- * Optional Fortran datatypes, these map to representable types
- * in the lower layer, aka as other Fortran types have to map to C types,
- * additionally, if the type has the same size as the mandatory
- * Fortran type, map to this one.
+ * Ensure we can support the predefined datatypes.
  */
-/* LOGICAL */
-#if OMPI_SIZEOF_FORTRAN_LOGICAL1 == OMPI_SIZEOF_FORTRAN_LOGICAL
-#  define OMPI_DATATYPE_MPI_LOGICAL1              OMPI_DATATYPE_MPI_LOGICAL
-#elif OMPI_SIZEOF_FORTRAN_LOGICAL1 == 1
-#  define OMPI_DATATYPE_MPI_LOGICAL1              OMPI_DATATYPE_MPI_INT8_T
-#elif OMPI_SIZEOF_FORTRAN_LOGICAL1 == 2
-#  define OMPI_DATATYPE_MPI_LOGICAL1              OMPI_DATATYPE_MPI_INT16_T
-#elif OMPI_SIZEOF_FORTRAN_LOGICAL1 == 4
-#  define OMPI_DATATYPE_MPI_LOGICAL1              OMPI_DATATYPE_MPI_INT32_T
-#elif OMPI_SIZEOF_FORTRAN_LOGICAL1 == 8
-#  define OMPI_DATATYPE_MPI_LOGICAL1              OMPI_DATATYPE_MPI_INT64_T
-#else
-#  define OMPI_DATATYPE_MPI_LOGICAL1              OMPI_DATATYPE_MPI_UNAVAILABLE
-#endif
-
-#if OMPI_SIZEOF_FORTRAN_LOGICAL2 == OMPI_SIZEOF_FORTRAN_LOGICAL
-#  define OMPI_DATATYPE_MPI_LOGICAL2              OMPI_DATATYPE_MPI_LOGICAL
-#elif OMPI_SIZEOF_FORTRAN_LOGICAL2 == 1
-#  define OMPI_DATATYPE_MPI_LOGICAL2              OMPI_DATATYPE_MPI_INT8_T
-#elif OMPI_SIZEOF_FORTRAN_LOGICAL2 == 2
-#  define OMPI_DATATYPE_MPI_LOGICAL2              OMPI_DATATYPE_MPI_INT16_T
-#elif OMPI_SIZEOF_FORTRAN_LOGICAL2 == 4
-#  define OMPI_DATATYPE_MPI_LOGICAL2              OMPI_DATATYPE_MPI_INT32_T
-#elif OMPI_SIZEOF_FORTRAN_LOGICAL2 == 8
-#  define OMPI_DATATYPE_MPI_LOGICAL2              OMPI_DATATYPE_MPI_INT64_T
-#else
-#  define OMPI_DATATYPE_MPI_LOGICAL2              OMPI_DATATYPE_MPI_UNAVAILABLE
-#endif
-
-#if OMPI_SIZEOF_FORTRAN_LOGICAL4 == OMPI_SIZEOF_FORTRAN_LOGICAL
-#  define OMPI_DATATYPE_MPI_LOGICAL4              OMPI_DATATYPE_MPI_LOGICAL
-#elif OMPI_SIZEOF_FORTRAN_LOGICAL4 == 1
-#  define OMPI_DATATYPE_MPI_LOGICAL4              OMPI_DATATYPE_MPI_INT8_T
-#elif OMPI_SIZEOF_FORTRAN_LOGICAL4 == 2
-#  define OMPI_DATATYPE_MPI_LOGICAL4              OMPI_DATATYPE_MPI_INT16_T
-#elif OMPI_SIZEOF_FORTRAN_LOGICAL4 == 4
-#  define OMPI_DATATYPE_MPI_LOGICAL4              OMPI_DATATYPE_MPI_INT32_T
-#elif OMPI_SIZEOF_FORTRAN_LOGICAL4 == 8
-#  define OMPI_DATATYPE_MPI_LOGICAL4              OMPI_DATATYPE_MPI_INT64_T
-#else
-#  define OMPI_DATATYPE_MPI_LOGICAL4              OMPI_DATATYPE_MPI_UNAVAILABLE
-#endif
-
-#if OMPI_SIZEOF_FORTRAN_LOGICAL8 == OMPI_SIZEOF_FORTRAN_LOGICAL
-#  define OMPI_DATATYPE_MPI_LOGICAL8              OMPI_DATATYPE_MPI_LOGICAL
-#elif OMPI_SIZEOF_FORTRAN_LOGICAL8 == 1
-#  define OMPI_DATATYPE_MPI_LOGICAL8              OMPI_DATATYPE_MPI_INT8_T
-#elif OMPI_SIZEOF_FORTRAN_LOGICAL8 == 2
-#  define OMPI_DATATYPE_MPI_LOGICAL8              OMPI_DATATYPE_MPI_INT16_T
-#elif OMPI_SIZEOF_FORTRAN_LOGICAL8 == 4
-#  define OMPI_DATATYPE_MPI_LOGICAL8              OMPI_DATATYPE_MPI_INT32_T
-#elif OMPI_SIZEOF_FORTRAN_LOGICAL8 == 8
-#  define OMPI_DATATYPE_MPI_LOGICAL8              OMPI_DATATYPE_MPI_INT64_T
-#else
-#  define OMPI_DATATYPE_MPI_LOGICAL8              OMPI_DATATYPE_MPI_UNAVAILABLE
-#endif
-
-/* INTEGER */
-#if OMPI_SIZEOF_FORTRAN_INTEGER1 == OMPI_SIZEOF_FORTRAN_INTEGER
-#  define OMPI_DATATYPE_MPI_INTEGER1              OMPI_DATATYPE_MPI_INTEGER
-#elif OMPI_SIZEOF_FORTRAN_INTEGER1 == 1
-#  define OMPI_DATATYPE_MPI_INTEGER1              OMPI_DATATYPE_MPI_INT8_T
-#elif OMPI_SIZEOF_FORTRAN_INTEGER1 == 2
-#  define OMPI_DATATYPE_MPI_INTEGER1              OMPI_DATATYPE_MPI_INT16_T
-#elif OMPI_SIZEOF_FORTRAN_INTEGER1 == 4
-#  define OMPI_DATATYPE_MPI_INTEGER1              OMPI_DATATYPE_MPI_INT32_T
-#elif OMPI_SIZEOF_FORTRAN_INTEGER1 == 8
-#  define OMPI_DATATYPE_MPI_INTEGER1              OMPI_DATATYPE_MPI_INT64_T
-#else
-#  define OMPI_DATATYPE_MPI_INTEGER1              OMPI_DATATYPE_MPI_UNAVAILABLE
-#endif
-
-#if OMPI_SIZEOF_FORTRAN_INTEGER2 == OMPI_SIZEOF_FORTRAN_INTEGER
-#  define OMPI_DATATYPE_MPI_INTEGER2              OMPI_DATATYPE_MPI_INTEGER
-#elif OMPI_SIZEOF_FORTRAN_INTEGER2 == 1
-#  define OMPI_DATATYPE_MPI_INTEGER2              OMPI_DATATYPE_MPI_INT8_T
-#elif OMPI_SIZEOF_FORTRAN_INTEGER2 == 2
-#  define OMPI_DATATYPE_MPI_INTEGER2              OMPI_DATATYPE_MPI_INT16_T
-#elif OMPI_SIZEOF_FORTRAN_INTEGER2 == 4
-#  define OMPI_DATATYPE_MPI_INTEGER2              OMPI_DATATYPE_MPI_INT32_T
-#elif OMPI_SIZEOF_FORTRAN_INTEGER2 == 8
-#  define OMPI_DATATYPE_MPI_INTEGER2              OMPI_DATATYPE_MPI_INT64_T
-#else
-#  define OMPI_DATATYPE_MPI_INTEGER2              OMPI_DATATYPE_MPI_UNAVAILABLE
-#endif
-
-#if OMPI_SIZEOF_FORTRAN_INTEGER4 == OMPI_SIZEOF_FORTRAN_INTEGER
-#  define OMPI_DATATYPE_MPI_INTEGER4              OMPI_DATATYPE_MPI_INTEGER
-#elif OMPI_SIZEOF_FORTRAN_INTEGER4 == 1
-#  define OMPI_DATATYPE_MPI_INTEGER4              OMPI_DATATYPE_MPI_INT8_T
-#elif OMPI_SIZEOF_FORTRAN_INTEGER4 == 2
-#  define OMPI_DATATYPE_MPI_INTEGER4              OMPI_DATATYPE_MPI_INT16_T
-#elif OMPI_SIZEOF_FORTRAN_INTEGER4 == 4
-#  define OMPI_DATATYPE_MPI_INTEGER4              OMPI_DATATYPE_MPI_INT32_T
-#elif OMPI_SIZEOF_FORTRAN_INTEGER4 == 8
-#  define OMPI_DATATYPE_MPI_INTEGER4              OMPI_DATATYPE_MPI_INT64_T
-#else
-#  define OMPI_DATATYPE_MPI_INTEGER4              OMPI_DATATYPE_MPI_UNAVAILABLE
-#endif
-
-#if OMPI_SIZEOF_FORTRAN_INTEGER8 == OMPI_SIZEOF_FORTRAN_INTEGER
-#  define OMPI_DATATYPE_MPI_INTEGER8              OMPI_DATATYPE_MPI_INTEGER
-#elif OMPI_SIZEOF_FORTRAN_INTEGER8 == 1
-#  define OMPI_DATATYPE_MPI_INTEGER8              OMPI_DATATYPE_MPI_INT8_T
-#elif OMPI_SIZEOF_FORTRAN_INTEGER8 == 2
-#  define OMPI_DATATYPE_MPI_INTEGER8              OMPI_DATATYPE_MPI_INT16_T
-#elif OMPI_SIZEOF_FORTRAN_INTEGER8 == 4
-#  define OMPI_DATATYPE_MPI_INTEGER8              OMPI_DATATYPE_MPI_INT32_T
-#elif OMPI_SIZEOF_FORTRAN_INTEGER8 == 8
-#  define OMPI_DATATYPE_MPI_INTEGER8              OMPI_DATATYPE_MPI_INT64_T
-#else
-#  define OMPI_DATATYPE_MPI_INTEGER8              OMPI_DATATYPE_MPI_UNAVAILABLE
-#endif
-
-#if OMPI_SIZEOF_FORTRAN_INTEGER16 == OMPI_SIZEOF_FORTRAN_INTEGER
-#  define OMPI_DATATYPE_MPI_INTEGER16             OMPI_DATATYPE_MPI_INTEGER
-#elif OMPI_SIZEOF_FORTRAN_INTEGER16 == 1
-#  define OMPI_DATATYPE_MPI_INTEGER16             OMPI_DATATYPE_MPI_INT8_T
-#elif OMPI_SIZEOF_FORTRAN_INTEGER16 == 2
-#  define OMPI_DATATYPE_MPI_INTEGER16             OMPI_DATATYPE_MPI_INT16_T
-#elif OMPI_SIZEOF_FORTRAN_INTEGER16 == 4
-#  define OMPI_DATATYPE_MPI_INTEGER16             OMPI_DATATYPE_MPI_INT32_T
-#elif OMPI_SIZEOF_FORTRAN_INTEGER16 == 8
-#  define OMPI_DATATYPE_MPI_INTEGER16             OMPI_DATATYPE_MPI_INT64_T
-#else
-#  define OMPI_DATATYPE_MPI_INTEGER16             OMPI_DATATYPE_MPI_UNAVAILABLE
-#endif
-
-/* REAL */
-#if OMPI_SIZEOF_FORTRAN_REAL2 == OMPI_SIZEOF_FORTRAN_REAL
-#  define OMPI_DATATYPE_MPI_REAL2                 OMPI_DATATYPE_MPI_REAL
-#elif (defined(HAVE_SHORT_FLOAT) && OMPI_SIZEOF_FORTRAN_REAL2 == SIZEOF_SHORT_FLOAT) || \
-      (defined(HAVE_OPAL_SHORT_FLOAT_T) && OMPI_SIZEOF_FORTRAN_REAL2 == SIZEOF_OPAL_SHORT_FLOAT_T)
-#  define OMPI_DATATYPE_MPI_REAL2                 OMPI_DATATYPE_MPI_SHORT_FLOAT
-#elif OMPI_SIZEOF_FORTRAN_REAL2 == SIZEOF_FLOAT
-#  define OMPI_DATATYPE_MPI_REAL2                 OMPI_DATATYPE_MPI_FLOAT
-#elif OMPI_SIZEOF_FORTRAN_REAL2 == SIZEOF_DOUBLE
-#  define OMPI_DATATYPE_MPI_REAL2                 OMPI_DATATYPE_MPI_DOUBLE
-#elif OMPI_SIZEOF_FORTRAN_REAL2 == SIZEOF_LONG_DOUBLE
-#  define OMPI_DATATYPE_MPI_REAL2                 OMPI_DATATYPE_MPI_LONG_DOUBLE
-#else
-#  define OMPI_DATATYPE_MPI_REAL2                 OMPI_DATATYPE_MPI_UNAVAILABLE
-#endif
-
-#if OMPI_SIZEOF_FORTRAN_REAL4 == OMPI_SIZEOF_FORTRAN_REAL
-#  define OMPI_DATATYPE_MPI_REAL4                 OMPI_DATATYPE_MPI_REAL
-#elif (defined(HAVE_SHORT_FLOAT) && OMPI_SIZEOF_FORTRAN_REAL4 == SIZEOF_SHORT_FLOAT) || \
-      (defined(HAVE_OPAL_SHORT_FLOAT_T) && OMPI_SIZEOF_FORTRAN_REAL4 == SIZEOF_OPAL_SHORT_FLOAT_T)
-#  define OMPI_DATATYPE_MPI_REAL4                 OMPI_DATATYPE_MPI_SHORT_FLOAT
-#elif OMPI_SIZEOF_FORTRAN_REAL4 == SIZEOF_FLOAT
-#  define OMPI_DATATYPE_MPI_REAL4                 OMPI_DATATYPE_MPI_FLOAT
-#elif OMPI_SIZEOF_FORTRAN_REAL4 == SIZEOF_DOUBLE
-#  define OMPI_DATATYPE_MPI_REAL4                 OMPI_DATATYPE_MPI_DOUBLE
-#elif OMPI_SIZEOF_FORTRAN_REAL4 == SIZEOF_LONG_DOUBLE
-#  define OMPI_DATATYPE_MPI_REAL4                 OMPI_DATATYPE_MPI_LONG_DOUBLE
-#else
-#  define OMPI_DATATYPE_MPI_REAL4                 OMPI_DATATYPE_MPI_UNAVAILABLE
-#endif
-
-#if OMPI_SIZEOF_FORTRAN_REAL8 == OMPI_SIZEOF_FORTRAN_REAL
-#  define OMPI_DATATYPE_MPI_REAL8                 OMPI_DATATYPE_MPI_REAL
-#elif (defined(HAVE_SHORT_FLOAT) && OMPI_SIZEOF_FORTRAN_REAL8 == SIZEOF_SHORT_FLOAT) || \
-      (defined(HAVE_OPAL_SHORT_FLOAT_T) && OMPI_SIZEOF_FORTRAN_REAL8 == SIZEOF_OPAL_SHORT_FLOAT_T)
-#  define OMPI_DATATYPE_MPI_REAL8                 OMPI_DATATYPE_MPI_SHORT_FLOAT
-#elif OMPI_SIZEOF_FORTRAN_REAL8 == SIZEOF_FLOAT
-#  define OMPI_DATATYPE_MPI_REAL8                 OMPI_DATATYPE_MPI_FLOAT
-#elif OMPI_SIZEOF_FORTRAN_REAL8 == SIZEOF_DOUBLE
-#  define OMPI_DATATYPE_MPI_REAL8                 OMPI_DATATYPE_MPI_DOUBLE
-#elif OMPI_SIZEOF_FORTRAN_REAL8 == SIZEOF_LONG_DOUBLE
-#  define OMPI_DATATYPE_MPI_REAL8                 OMPI_DATATYPE_MPI_LONG_DOUBLE
-#else
-#  define OMPI_DATATYPE_MPI_REAL8                 OMPI_DATATYPE_MPI_UNAVAILABLE
-#endif
-
-#if OMPI_SIZEOF_FORTRAN_REAL16 == OMPI_SIZEOF_FORTRAN_REAL
-#  define OMPI_DATATYPE_MPI_REAL16                OMPI_DATATYPE_MPI_REAL
-#elif (defined(HAVE_SHORT_FLOAT) && OMPI_SIZEOF_FORTRAN_REAL16 == SIZEOF_SHORT_FLOAT) || \
-      (defined(HAVE_OPAL_SHORT_FLOAT_T) && OMPI_SIZEOF_FORTRAN_REAL16 == SIZEOF_OPAL_SHORT_FLOAT_T)
-#  define OMPI_DATATYPE_MPI_REAL16                 OMPI_DATATYPE_MPI_SHORT_FLOAT
-#elif OMPI_SIZEOF_FORTRAN_REAL16 == SIZEOF_FLOAT
-#  define OMPI_DATATYPE_MPI_REAL16                OMPI_DATATYPE_MPI_FLOAT
-#elif OMPI_SIZEOF_FORTRAN_REAL16 == SIZEOF_DOUBLE
-#  define OMPI_DATATYPE_MPI_REAL16                OMPI_DATATYPE_MPI_DOUBLE
-#elif OMPI_SIZEOF_FORTRAN_REAL16 == SIZEOF_LONG_DOUBLE
-#  define OMPI_DATATYPE_MPI_REAL16                OMPI_DATATYPE_MPI_LONG_DOUBLE
-#else
-#  define OMPI_DATATYPE_MPI_REAL16                OMPI_DATATYPE_MPI_UNAVAILABLE
+#if OMPI_DATATYPE_MAX_PREDEFINED < OMPI_DATATYPE_MPI_UNAVAILABLE
+#error OMPI_DATATYPE_MAX_PREDEFINED should be updated to the value of OMPI_DATATYPE_MPI_UNAVAILABLE
 #endif
 
-/*
- * C++ datatypes, these map to C datatypes.
- */
-#define OMPI_DATATYPE_MPI_CXX_BOOL                OMPI_DATATYPE_MPI_C_BOOL
-#define OMPI_DATATYPE_MPI_CXX_SHORT_FLOAT_COMPLEX OMPI_DATATYPE_MPI_C_SHORT_FLOAT_COMPLEX
-#define OMPI_DATATYPE_MPI_CXX_FLOAT_COMPLEX       OMPI_DATATYPE_MPI_C_FLOAT_COMPLEX
-#define OMPI_DATATYPE_MPI_CXX_DOUBLE_COMPLEX      OMPI_DATATYPE_MPI_C_DOUBLE_COMPLEX
-#define OMPI_DATATYPE_MPI_CXX_LONG_DOUBLE_COMPLEX OMPI_DATATYPE_MPI_C_LONG_DOUBLE_COMPLEX
-
 extern const ompi_datatype_t* ompi_datatype_basicDatatypes[OMPI_DATATYPE_MPI_MAX_PREDEFINED];
 
 /* There 3 types of predefined data types.
@@ -431,7 +216,7 @@ extern const ompi_datatype_t* ompi_datatype_basicDatatypes[OMPI_DATATYPE_MPI_MAX
     .packed_description = 0,                                                         \
     .name = "MPI_" # NAME
 
-#define OMPI_DATATYPE_INITIALIZER_UNAVAILABLE(FLAGS)                                 \
+#define OMPI_DATATYPE_INITIALIZER_UNAVAILABLE(FLAGS, OMPI_ID)                        \
     OPAL_DATATYPE_INITIALIZER_UNAVAILABLE(FLAGS)
 
 #define OMPI_DATATYPE_INIT_PREDEFINED_BASIC_TYPE_X( TYPE, NAME, FLAGS )              \
@@ -440,7 +225,8 @@ extern const ompi_datatype_t* ompi_datatype_basicDatatypes[OMPI_DATATYPE_MPI_MAX
             OMPI_DATATYPE_INITIALIZER_ ## TYPE (OMPI_DATATYPE_FLAG_PREDEFINED |      \
                                                 OMPI_DATATYPE_FLAG_ANALYZED   |      \
                                                 OMPI_DATATYPE_FLAG_MONOTONIC  |      \
-                                                (FLAGS)) /*super*/,                  \
+                                                (FLAGS), /*super*/                   \
+                                                OMPI_DATATYPE_MPI_ ## NAME),         \
             OMPI_DATATYPE_EMPTY_DATA(NAME) /*id,d_f_to_c_index,d_keyhash,args,packed_description,name*/ \
         },                                                                           \
         {0, } /* padding */                                                          \
@@ -493,7 +279,7 @@ extern const ompi_datatype_t* ompi_datatype_basicDatatypes[OMPI_DATATYPE_MPI_MAX
         .name = OPAL_DATATYPE_INIT_NAME(TYPE ## SIZE),                               \
         .desc = OPAL_DATATYPE_INIT_DESC_PREDEFINED(TYPE ## SIZE),                    \
         .opt_desc = OPAL_DATATYPE_INIT_DESC_PREDEFINED(TYPE ## SIZE),                \
-        .ptypes = OPAL_DATATYPE_INIT_PTYPES_ARRAY(TYPE ## SIZE)                      \
+        .ptypes = OPAL_DATATYPE_INIT_PTYPES_ARRAY(TYPE ## SIZE, SIZE, NAME)          \
     }
 
 #define OMPI_DATATYPE_INIT_PREDEFINED_BASIC_TYPE_FORTRAN( TYPE, NAME, SIZE, ALIGN, FLAGS ) \
diff --git a/ompi/datatype/ompi_datatype_module.c b/ompi/datatype/ompi_datatype_module.c
index 1fbf35aba3b..273eebfdb18 100644
--- a/ompi/datatype/ompi_datatype_module.c
+++ b/ompi/datatype/ompi_datatype_module.c
@@ -375,6 +375,37 @@ const ompi_datatype_t* ompi_datatype_basicDatatypes[OMPI_DATATYPE_MPI_MAX_PREDEF
     [OMPI_DATATYPE_MPI_SHORT_FLOAT] = &ompi_mpi_short_float.dt,
     [OMPI_DATATYPE_MPI_C_SHORT_FLOAT_COMPLEX] = &ompi_mpi_c_short_float_complex.dt,
 
+    [OMPI_DATATYPE_MPI_CHAR] = &ompi_mpi_char.dt,
+    [OMPI_DATATYPE_MPI_SIGNED_CHAR] = &ompi_mpi_signed_char.dt,
+    [OMPI_DATATYPE_MPI_UNSIGNED_CHAR] = &ompi_mpi_unsigned_char.dt,
+    [OMPI_DATATYPE_MPI_BYTE] = &ompi_mpi_byte.dt,
+    [OMPI_DATATYPE_MPI_SHORT] = &ompi_mpi_short.dt,
+    [OMPI_DATATYPE_MPI_UNSIGNED_SHORT] = &ompi_mpi_unsigned_short.dt,
+    [OMPI_DATATYPE_MPI_INT] = &ompi_mpi_int.dt,
+    [OMPI_DATATYPE_MPI_UNSIGNED] = &ompi_mpi_unsigned.dt,
+    [OMPI_DATATYPE_MPI_LONG] = &ompi_mpi_long.dt,
+    [OMPI_DATATYPE_MPI_UNSIGNED_LONG] = &ompi_mpi_unsigned_long.dt,
+    [OMPI_DATATYPE_MPI_LONG_LONG_INT] = &ompi_mpi_long_long_int.dt,
+    [OMPI_DATATYPE_MPI_UNSIGNED_LONG_LONG] = &ompi_mpi_unsigned_long_long.dt,
+    [OMPI_DATATYPE_MPI_LOGICAL1] = &ompi_mpi_logical1.dt,
+    [OMPI_DATATYPE_MPI_LOGICAL2] = &ompi_mpi_logical2.dt,
+    [OMPI_DATATYPE_MPI_LOGICAL4] = &ompi_mpi_logical4.dt,
+    [OMPI_DATATYPE_MPI_LOGICAL8] = &ompi_mpi_logical8.dt,
+    [OMPI_DATATYPE_MPI_INTEGER1] = &ompi_mpi_integer1.dt,
+    [OMPI_DATATYPE_MPI_INTEGER2] = &ompi_mpi_integer2.dt,
+    [OMPI_DATATYPE_MPI_INTEGER4] = &ompi_mpi_integer4.dt,
+    [OMPI_DATATYPE_MPI_INTEGER8] = &ompi_mpi_integer8.dt,
+    [OMPI_DATATYPE_MPI_INTEGER16] = &ompi_mpi_integer16.dt,
+    [OMPI_DATATYPE_MPI_REAL2] = &ompi_mpi_real2.dt,
+    [OMPI_DATATYPE_MPI_REAL4] = &ompi_mpi_real4.dt,
+    [OMPI_DATATYPE_MPI_REAL8] = &ompi_mpi_real8.dt,
+    [OMPI_DATATYPE_MPI_REAL16] = &ompi_mpi_real16.dt,
+    [OMPI_DATATYPE_MPI_CXX_BOOL] = &ompi_mpi_cxx_bool.dt,
+    [OMPI_DATATYPE_MPI_CXX_SHORT_FLOAT_COMPLEX] = &ompi_mpi_cxx_sfltcplex.dt,
+    [OMPI_DATATYPE_MPI_CXX_FLOAT_COMPLEX] = &ompi_mpi_cxx_cplex.dt,
+    [OMPI_DATATYPE_MPI_CXX_DOUBLE_COMPLEX] = &ompi_mpi_cxx_dblcplex.dt,
+    [OMPI_DATATYPE_MPI_CXX_LONG_DOUBLE_COMPLEX] = &ompi_mpi_cxx_ldblcplex.dt,
+
     [OMPI_DATATYPE_MPI_UNAVAILABLE] = &ompi_mpi_unavailable.dt,
 };
 
@@ -663,12 +694,32 @@ int32_t ompi_datatype_init( void )
         }
     }
     ompi_datatype_default_convertors_init();
+
+/*
+ *  Many of the predefined OMPI datatypes use OPAL initializers that map
+ *  their .desc entries into a shared space where it's impossible to set an
+ *  .ompi_id due to collision.  This call privatizes each that's constructed
+ *  that way (it checks if the incoming .desc[] is in the shared range).
+ *  Note some of the predefineds are already fine, e.g. the ones that use
+ *  OMPI_DATATYPE_INIT_DEFER(), but I don't see any good way to identify which
+ *  are okay vs not other than by checking each predefined datatype.
+ */
+    for (i=OMPI_DATATYPE_MPI_EMPTY+1; i<OMPI_DATATYPE_MPI_MAX_PREDEFINED; ++i) {
+        opal_datatype_desc_update_reset((opal_datatype_t*)&ompi_datatype_basicDatatypes[i]->super);
+    }
+
     return OMPI_SUCCESS;
 }
 
 
 int32_t ompi_datatype_finalize( void )
 {
+    int i;
+
+    for (i=OMPI_DATATYPE_MPI_EMPTY+1; i<OMPI_DATATYPE_MPI_MAX_PREDEFINED; ++i) {
+        opal_datatype_desc_update_free((opal_datatype_t*)&ompi_datatype_basicDatatypes[i]->super);
+    }
+
     /* As the synonyms are just copies of the internal data we should not free them.
      * Anyway they are over the limit of OMPI_DATATYPE_MPI_MAX_PREDEFINED so they will never get freed.
      */
diff --git a/opal/datatype/Makefile.am b/opal/datatype/Makefile.am
index 36d13eff3b5..35bbaf6893f 100644
--- a/opal/datatype/Makefile.am
+++ b/opal/datatype/Makefile.am
@@ -68,7 +68,9 @@ libdatatype_la_SOURCES = \
         opal_datatype_pack.c \
         opal_datatype_position.c \
         opal_datatype_resize.c \
-        opal_datatype_unpack.c
+        opal_datatype_unpack.c \
+        opal_datatype_setexternal32.c \
+        opal_datatype_reset_predef.c
 
 libdatatype_la_LIBADD = libdatatype_reliable.la
 
diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c
index 853e5b1632f..ae2d0ac31a3 100644
--- a/opal/datatype/opal_convertor.c
+++ b/opal/datatype/opal_convertor.c
@@ -119,6 +119,12 @@ opal_convertor_master_t* opal_convertor_find_or_create_master( uint32_t remote_a
      */
     remote_sizes = (size_t*)master->remote_sizes;
     memcpy(remote_sizes, opal_datatype_local_sizes, sizeof(size_t) * OPAL_DATATYPE_MAX_PREDEFINED);
+
+    /*
+     * Specify that ompi_remote_sizes[] isn't initially set up
+     */
+    master->ompi_remote_sizes_is_set = 0;
+
     /**
      * If the local and remote architecture are the same there is no need
      * to check for the remote data sizes. They will always be the same as
@@ -448,13 +454,36 @@ int32_t opal_convertor_set_position_nocheck( opal_convertor_t* convertor,
 
 static size_t
 opal_datatype_compute_remote_size( const opal_datatype_t* pData,
-                                   const size_t* sizes )
+                                   const size_t* remote_sizes,
+                                   const size_t* ompi_remote_sizes,
+                                   int ompi_remote_sizes_is_set )
 {
     uint32_t typeMask = pData->bdt_used;
     size_t length = 0;
 
     if (opal_datatype_is_predefined(pData)) {
-        return sizes[pData->desc.desc->elem.common.type];
+/*
+ *  opal datatypes can exist independently of OMPI datatypes,
+ *  or they can be connected to an OMPI datatype, so .ompi_id
+ *  can be set but doesn't have to be.
+ *  Similarly convertors can set an ompi_remote_sizes[] array
+ *  but don't have to.
+ *  If either of those isn't available then remote_sizes[opal_id]
+ *  is the way to get the size of a predefined opal datatype.
+ *  But if an ompi_id is available and if the convertor has
+ *  specified a set of ompi_remote_sizes[] then those allow
+ *  a more specific size to be given.
+ *
+ *  Example: MPI_LONG and MPI_INT64_T both have .id OPAL_DATATYPE_INT8
+ *  so the remote_sizes[.id] would be 8.  But external32 wants to
+ *  see the size for the MPI_LONG as 4.  This requires the .ompi_id
+ *  to distinguish.
+ */
+        int ompi_id = pData->desc.desc->elem.common.ompi_id;
+        if (ompi_id != OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY && ompi_remote_sizes_is_set) {
+            return ompi_remote_sizes[ompi_id];
+        }
+        return remote_sizes[pData->desc.desc->elem.common.type];
     }
 
     if( OPAL_UNLIKELY(NULL == pData->ptypes) ) {
@@ -462,10 +491,20 @@ opal_datatype_compute_remote_size( const opal_datatype_t* pData,
         opal_datatype_compute_ptypes( (opal_datatype_t*)pData );
     }
 
-    for( int i = OPAL_DATATYPE_FIRST_TYPE; typeMask && (i < OPAL_DATATYPE_MAX_PREDEFINED); i++ ) {
-        if( typeMask & ((uint32_t)1 << i) ) {
-            length += (pData->ptypes[i] * sizes[i]);
-            typeMask ^= ((uint32_t)1 << i);
+/*
+ *  For walking the ptypes entries, walk the .ompi_dt_count[] if available,
+ *  or the .bdt_count[] if the ompi data is unavailable.
+ */
+    if (pData->ptypes->ompi_dt_count_is_valid && ompi_remote_sizes_is_set) {
+        for( int i = OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY+1; i < OPAL_MIRROR_OMPI_DATATYPE_MPI_MAX_PREDEFINED; i++ ) {
+            length += (pData->ptypes->ompi_dt_count[i] * ompi_remote_sizes[i]);
+        }
+    } else {
+        for( int i = OPAL_DATATYPE_FIRST_TYPE; typeMask && (i < OPAL_DATATYPE_MAX_PREDEFINED); i++ ) {
+            if( typeMask & ((uint32_t)1 << i) ) {
+                length += (pData->ptypes->bdt_count[i] * remote_sizes[i]);
+                typeMask ^= ((uint32_t)1 << i);
+            }
         }
     }
     return length;
@@ -489,7 +528,9 @@ size_t opal_convertor_compute_remote_size( opal_convertor_t* pConvertor )
         if( 0 == (pConvertor->flags & CONVERTOR_HAS_REMOTE_SIZE) ) {
             /* This is for a single datatype, we must update it with the count */
             pConvertor->remote_size = opal_datatype_compute_remote_size(datatype,
-                                                                        pConvertor->master->remote_sizes);
+                                                                        pConvertor->master->remote_sizes,
+                                                                        pConvertor->master->ompi_remote_sizes,
+                                                                        pConvertor->master->ompi_remote_sizes_is_set);
             pConvertor->remote_size *= pConvertor->count;
         }
     }
@@ -749,3 +790,4 @@ void opal_datatype_dump_stack( const dt_stack_t* pStack, int stack_pos,
     }
     opal_output( 0, "\n" );
 }
+
diff --git a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h
index b24d94c37b0..66cca243710 100644
--- a/opal/datatype/opal_convertor.h
+++ b/opal/datatype/opal_convertor.h
@@ -397,6 +397,11 @@ OPAL_DECLSPEC int
 opal_convertor_generic_simple_position( opal_convertor_t* pConvertor,
                                         size_t* position );
 
+OPAL_DECLSPEC void
+opal_convertor_set_ompi_remote_size(opal_convertor_t *convertor, int idx, int size);
+OPAL_DECLSPEC void
+opal_convertor_ompi_remote_size_is_ready(opal_convertor_t *convertor);
+
 END_C_DECLS
 
 #endif  /* OPAL_CONVERTOR_H_HAS_BEEN_INCLUDED */
diff --git a/opal/datatype/opal_convertor_internal.h b/opal/datatype/opal_convertor_internal.h
index 39690f5bd19..94622060743 100644
--- a/opal/datatype/opal_convertor_internal.h
+++ b/opal/datatype/opal_convertor_internal.h
@@ -33,6 +33,15 @@ typedef struct opal_convertor_master_t {
     uint32_t                        flags;
     uint32_t                        hetero_mask;
     const size_t                    remote_sizes[OPAL_DATATYPE_MAX_PREDEFINED];
+    size_t                          ompi_remote_sizes[OPAL_MIRROR_OMPI_DATATYPE_MPI_MAX_PREDEFINED];
+    int                             ompi_remote_sizes_is_set;
+                            /*
+                             *  opal datatypes don't have to be associated with
+                             *  an ompi datatype, so remote_sizes[opal_id]
+                             *  is necessary to keep, but if there is an
+                             *  ompi_id, it's more specific and should be
+                             *  used with ompi_remote_sizes[].
+                             */
     conversion_fct_t*               pFunctions;   /**< the convertor functions pointer */
 } opal_convertor_master_t;
 
diff --git a/opal/datatype/opal_datatype.h b/opal/datatype/opal_datatype.h
index 8f054f7c7b0..62a06e1e433 100644
--- a/opal/datatype/opal_datatype.h
+++ b/opal/datatype/opal_datatype.h
@@ -61,8 +61,16 @@ BEGIN_C_DECLS
  *
  * BEWARE: This constant should reflect whatever the OMPI-layer needs.
  */
-#define OPAL_DATATYPE_MAX_SUPPORTED  50
+#define OPAL_DATATYPE_MAX_SUPPORTED  80
 
+/*
+ *  These need to match OMPI_DATATYPE_MPI_EMPTY and
+ *  OMPI_DATATYPE_MPI_MAX_PREDEFINED which we don't have
+ *  visibility into over here in opal.  So these values
+ *  will be checked for consistency in ompi_datatype_internal.h
+ */
+#define OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY          0
+#define OPAL_MIRROR_OMPI_DATATYPE_MPI_MAX_PREDEFINED 81
 
 /* flags for the datatypes. */
 #define OPAL_DATATYPE_FLAG_UNAVAILABLE   0x0001  /**< datatypes unavailable on the build (OS or compiler dependant) */
@@ -74,6 +82,7 @@ BEGIN_C_DECLS
 #define OPAL_DATATYPE_FLAG_USER_LB       0x0040  /**< has a user defined LB */
 #define OPAL_DATATYPE_FLAG_USER_UB       0x0080  /**< has a user defined UB */
 #define OPAL_DATATYPE_FLAG_DATA          0x0100  /**< data or control structure */
+#define OPAL_DATATYPE_FLAG_DESC_WAS_REALLOCATED 0x0200 /* predefined DT can have .desc modified */
 /*
  * We should make the difference here between the predefined contiguous and non contiguous
  * datatypes. The OPAL_DATATYPE_FLAG_BASIC is held by all predefined contiguous datatypes.
@@ -100,6 +109,18 @@ struct dt_type_desc_t {
 };
 typedef struct dt_type_desc_t dt_type_desc_t;
 
+/*
+ *  ptypes structure
+ *
+ *  There used to just be a ptypes[] array that was a count of how
+ *  many of each base OPAL_DATATYPE_ was in a given datatype.  This
+ *  keeps that but adds another field for using ompi IDs
+ */
+typedef struct {
+    size_t bdt_count[OPAL_DATATYPE_MAX_SUPPORTED]; /* element count per base OPAL_DATATYPE_ id */
+    size_t ompi_dt_count[OPAL_MIRROR_OMPI_DATATYPE_MPI_MAX_PREDEFINED]; /* element count per ompi_id */
+    int ompi_dt_count_is_valid; /* cleared to 0 once a counted element has no ompi_id; then only bdt_count[] is usable */
+} ptypes_t;
 
 /*
  * The datatype description.
@@ -126,16 +147,14 @@ struct opal_datatype_t {
     dt_type_desc_t     opt_desc; /**< short description of the data used when conversion is useless
                                       or in the send case (without conversion) */
 
-    size_t             *ptypes;  /**< array of basic predefined types that facilitate the computing
+    ptypes_t           *ptypes;  /**< array of basic predefined types that facilitate the computing
                                       of the remote size in heterogeneous environments. The length of the
                                       array is dependent on the maximum number of predefined datatypes of
                                       all language interfaces (because Fortran is not known at the OPAL
                                       layer). This field should never be initialized in homogeneous
                                       environments */
-    /* --- cacheline 5 boundary (320 bytes) was 32-36 bytes ago --- */
-
-    /* size: 352, cachelines: 6, members: 15 */
-    /* last cacheline: 28-32 bytes */
+    uint16_t           ompi_id;
+    /* size: 200 or 208, and there are a few bytes empty after the ompi_id */
 };
 
 typedef struct opal_datatype_t opal_datatype_t;
@@ -339,6 +358,16 @@ opal_datatype_span( const opal_datatype_t* pData, size_t count,
     return true_extent + extent * (count - 1);
 }
 
+/*
+ * For use with any datatype whose setup involves its
+ * desc[] entries being pointed into opal_datatype_predefined_elem_desc[].
+ * That's a shared space where different OMPI types collide, so
+ * any type built using it needs to copy its desc[] and update
+ * the .ompi_id there.
+ */
+OPAL_DECLSPEC int opal_datatype_desc_update_reset(opal_datatype_t *pData);
+OPAL_DECLSPEC void opal_datatype_desc_update_free(opal_datatype_t *pData);
+
 #if OPAL_ENABLE_DEBUG
 /*
  * Set a breakpoint to this function in your favorite debugger
diff --git a/opal/datatype/opal_datatype_add.c b/opal/datatype/opal_datatype_add.c
index 7038d37a4c4..559685d26fe 100644
--- a/opal/datatype/opal_datatype_add.c
+++ b/opal/datatype/opal_datatype_add.c
@@ -279,11 +279,19 @@ int32_t opal_datatype_add( opal_datatype_t* pdtBase, const opal_datatype_t* pdtA
      * predefined non contiguous datatypes (like MPI_SHORT_INT).
      */
     if( (pdtAdd->flags & (OPAL_DATATYPE_FLAG_PREDEFINED | OPAL_DATATYPE_FLAG_DATA)) == (OPAL_DATATYPE_FLAG_PREDEFINED | OPAL_DATATYPE_FLAG_DATA) ) {
-        if( NULL != pdtBase->ptypes )
-            pdtBase->ptypes[pdtAdd->id] += count;
+        if( NULL != pdtBase->ptypes ) {
+            pdtBase->ptypes->bdt_count[pdtAdd->id] += count;
+            if (pdtAdd->ompi_id != OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY) {
+                pdtBase->ptypes->ompi_dt_count[pdtAdd->ompi_id] += count;
+            } else {
+                /* an opal_datatype_t doesn't have to be connected to an ompi_id */
+                pdtBase->ptypes->ompi_dt_count_is_valid = 0;
+            }
+        }
 
         pLast->elem.common.flags     = pdtAdd->flags & ~(OPAL_DATATYPE_FLAG_COMMITTED);
         pLast->elem.common.type      = pdtAdd->id;
+        pLast->elem.common.ompi_id   = pdtAdd->ompi_id;
         pLast->elem.disp             = disp;
         pLast->elem.extent           = (ptrdiff_t)count * extent;
         /* assume predefined datatypes without extent, aka. contiguous */
@@ -305,7 +313,20 @@ int32_t opal_datatype_add( opal_datatype_t* pdtBase, const opal_datatype_t* pdtA
         pdtBase->flags |= (pdtAdd->flags & OPAL_DATATYPE_FLAG_USER_UB);
         if( (NULL != pdtBase->ptypes) && (NULL != pdtAdd->ptypes) ) {
             for( i = OPAL_DATATYPE_FIRST_TYPE; i < OPAL_DATATYPE_MAX_PREDEFINED; i++ )
-                if( pdtAdd->ptypes[i] != 0 ) pdtBase->ptypes[i] += (count * pdtAdd->ptypes[i]);
+                if( pdtAdd->ptypes->bdt_count[i] != 0 ) pdtBase->ptypes->bdt_count[i] += (count * pdtAdd->ptypes->bdt_count[i]); 
+            if (pdtBase->ptypes->ompi_dt_count_is_valid) {
+                if (pdtAdd->ptypes->ompi_dt_count_is_valid) {
+                    for( i = OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY+1; i < OPAL_MIRROR_OMPI_DATATYPE_MPI_MAX_PREDEFINED; i++ )
+                        if( pdtAdd->ptypes->ompi_dt_count[i] != 0 )
+                            pdtBase->ptypes->ompi_dt_count[i] += (count * pdtAdd->ptypes->ompi_dt_count[i]); 
+                } else {
+                    pdtBase->ptypes->ompi_dt_count_is_valid = 0;
+                }
+            }
+        } else if (NULL != pdtBase->ptypes) {
+            /* (We had ptypes data but didn't update it, must invalidate) */
+            free(pdtBase->ptypes);
+            pdtBase->ptypes = NULL;
         }
         if( 1 == pdtAdd->desc.used ) {
             pLast->elem        = pdtAdd->desc.desc[0].elem;
diff --git a/opal/datatype/opal_datatype_dump.c b/opal/datatype/opal_datatype_dump.c
index 27903db657e..d76039a36e7 100644
--- a/opal/datatype/opal_datatype_dump.c
+++ b/opal/datatype/opal_datatype_dump.c
@@ -50,7 +50,7 @@ int opal_datatype_contain_basic_datatypes( const opal_datatype_t* pData, char* p
                 index += snprintf( ptr + index, length - index, "%s:* ", opal_datatype_basicDatatypes[i]->name );
             } else {
                 index += snprintf( ptr + index, length - index, "%s:%" PRIsize_t " ", opal_datatype_basicDatatypes[i]->name,
-                                   pData->ptypes[i]);
+                                   pData->ptypes->bdt_count[i]);
             }
         }
         mask <<= 1;
diff --git a/opal/datatype/opal_datatype_get_count.c b/opal/datatype/opal_datatype_get_count.c
index 12dba4d0d48..5dc952157ec 100644
--- a/opal/datatype/opal_datatype_get_count.c
+++ b/opal/datatype/opal_datatype_get_count.c
@@ -158,8 +158,16 @@ int opal_datatype_compute_ptypes( opal_datatype_t* datatype )
     dt_elem_desc_t* pElems;
 
     if( NULL != datatype->ptypes ) return 0;
-    datatype->ptypes = (size_t*)calloc(OPAL_DATATYPE_MAX_SUPPORTED, sizeof(size_t));
+    datatype->ptypes = (ptypes_t*)calloc(1, sizeof(ptypes_t));
 
+    datatype->ptypes->ompi_dt_count_is_valid = 1; /* until proven otherwise */
+/*
+ *  The ptypes.bdt_count[] entries don't have enough information to later figure out
+ *  what the remote size of the packed data would be.  If the datatype is
+ *  2 MPI_LONG, those counts will only tell us it's 2 OPAL_DATATYPE_INT8.
+ *  So the new field .ompi_dt_count[] represents the same info but in terms
+ *  of ompi datatypes if possible.
+ */
     DUMP( "opal_datatype_compute_ptypes( %p )\n", (void*)datatype );
     pStack = (dt_stack_t*)alloca( sizeof(dt_stack_t) * (datatype->loops + 2) );
     pStack->count    = 1;
@@ -188,11 +196,19 @@ int opal_datatype_compute_ptypes( opal_datatype_t* datatype )
         }
         while( pElems[pos_desc].elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) {
             /* now here we have a basic datatype */
-            datatype->ptypes[pElems[pos_desc].elem.common.type] += (size_t)pElems[pos_desc].elem.count * pElems[pos_desc].elem.blocklen;
-            nbElems += (size_t)pElems[pos_desc].elem.count * pElems[pos_desc].elem.blocklen;
+            size_t mycount = (size_t)pElems[pos_desc].elem.count * pElems[pos_desc].elem.blocklen;
+            datatype->ptypes->bdt_count[pElems[pos_desc].elem.common.type] += mycount;
+            nbElems += mycount;
+
+            int ompi_id = pElems[pos_desc].elem.common.ompi_id;
+            if (ompi_id != OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY) {
+                datatype->ptypes->ompi_dt_count[ompi_id] += mycount;
+            } else {
+                datatype->ptypes->ompi_dt_count_is_valid = 0;
+            }
 
             DUMP( "  compute_ptypes-add: type %d count %"PRIsize_t" (total type %u total %lld)\n",
-                  pElems[pos_desc].elem.common.type, datatype->ptypes[pElems[pos_desc].elem.common.type],
+                  pElems[pos_desc].elem.common.type, datatype->ptypes->bdt_count[pElems[pos_desc].elem.common.type],
                   pElems[pos_desc].elem.count, nbElems );
             pos_desc++;  /* advance to the next data */
         }
diff --git a/opal/datatype/opal_datatype_internal.h b/opal/datatype/opal_datatype_internal.h
index b14acdf6168..e60c586f0b1 100644
--- a/opal/datatype/opal_datatype_internal.h
+++ b/opal/datatype/opal_datatype_internal.h
@@ -17,7 +17,7 @@
  * Copyright (c) 2017      Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
  * Copyright (c) 2018      FUJITSU LIMITED.  All rights reserved.
- * Copyright (c) 2021      IBM Corporation. All rights reserved.
+ * Copyright (c) 2020-2021 IBM Corporation.  All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -147,6 +147,7 @@ BEGIN_C_DECLS
 struct ddt_elem_id_description {
     uint16_t   flags;  /**< flags for the record */
     uint16_t   type;   /**< the basic data type id */
+    uint16_t   ompi_id; /**< id of the associated OMPI predefined type, or OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY when not set */
 };
 typedef struct ddt_elem_id_description ddt_elem_id_description;
 
@@ -221,10 +222,11 @@ union dt_elem_desc {
  * Create an element entry in the description. If the element is contiguous
  * collapse everything into the blocklen.
  */
-#define CREATE_ELEM(_place, _type, _flags, _blocklen, _count, _disp, _extent)  \
+#define CREATE_ELEM(_place, _type, _flags, _blocklen, _count, _disp, _extent, _ompi_id)  \
     do {                                                                       \
         (_place)->elem.common.flags = (_flags) | OPAL_DATATYPE_FLAG_DATA;      \
         (_place)->elem.common.type  = (_type);                                 \
+        (_place)->elem.common.ompi_id  = (_ompi_id);                           \
         (_place)->elem.blocklen     = (_blocklen);                             \
         (_place)->elem.count        = (_count);                                \
         (_place)->elem.extent       = (_extent);                               \
@@ -252,7 +254,11 @@ struct opal_datatype_t;
  */
 
 #define OPAL_DATATYPE_INIT_PTYPES_ARRAY_UNAVAILABLE NULL
-#define OPAL_DATATYPE_INIT_PTYPES_ARRAY(NAME) (size_t[OPAL_DATATYPE_MAX_PREDEFINED]){ [OPAL_DATATYPE_ ## NAME] = 1, [OPAL_DATATYPE_MAX_PREDEFINED-1] = 0 }
+#define OPAL_DATATYPE_INIT_PTYPES_ARRAY(NAME, SZ, MPI_SUFFIX) &(ptypes_t) { \
+        .bdt_count = { [OPAL_DATATYPE_ ## NAME] = 1, [OPAL_DATATYPE_MAX_PREDEFINED-1] = 0}, \
+        .ompi_dt_count = { [OMPI_DATATYPE_MPI_ ## MPI_SUFFIX ] = 1, [OPAL_MIRROR_OMPI_DATATYPE_MPI_MAX_PREDEFINED-1] = 0 }, \
+        .ompi_dt_count_is_valid = 1, \
+    }
 
 #define OPAL_DATATYPE_INIT_NAME(NAME) "OPAL_" #NAME
 
@@ -292,6 +298,7 @@ struct opal_datatype_t;
         .super = OPAL_OBJ_STATIC_INIT(opal_datatype_t),                 \
         .flags = OPAL_DATATYPE_FLAG_PREDEFINED | (FLAGS),               \
         .id = 0,                                                        \
+        .ompi_id = OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY,                 \
         .bdt_used = 0,                                                  \
         .size = 0,                                                      \
         .true_lb = 0, .true_ub = 0, .lb = 0, .ub = 0,                   \
@@ -303,11 +310,12 @@ struct opal_datatype_t;
         .ptypes = OPAL_DATATYPE_INIT_PTYPES_ARRAY_UNAVAILABLE           \
     }
 
-#define OPAL_DATATYPE_INIT_BASIC_TYPE( TYPE, NAME, FLAGS )              \
+#define OPAL_DATATYPE_INIT_BASIC_TYPE( TYPE, NAME, FLAGS, OMPI_ID )     \
     {                                                                   \
         .super = OPAL_OBJ_STATIC_INIT(opal_datatype_t),                 \
         .flags = OPAL_DATATYPE_FLAG_PREDEFINED | (FLAGS),               \
         .id = TYPE,                                                     \
+        .ompi_id = OMPI_ID,                                             \
         .bdt_used = (((uint32_t)1)<<(TYPE)),                            \
         .size = 0,                                                      \
         .true_lb = 0, .true_ub = 0, .lb = 0, .ub = 0,                   \
@@ -319,11 +327,12 @@ struct opal_datatype_t;
         .ptypes = OPAL_DATATYPE_INIT_PTYPES_ARRAY_UNAVAILABLE           \
     }
 
-#define OPAL_DATATYPE_INIT_BASIC_DATATYPE( TYPE, ALIGN, NAME, FLAGS )                \
+#define OPAL_DATATYPE_INIT_BASIC_DATATYPE( TYPE, ALIGN, NAME, FLAGS , OMPI_ID )      \
     {                                                                                \
         .super = OPAL_OBJ_STATIC_INIT(opal_datatype_t),                              \
         .flags = OPAL_DATATYPE_FLAG_BASIC | (FLAGS),                                 \
         .id = OPAL_DATATYPE_ ## NAME,                                                \
+        .ompi_id = OMPI_ID,                                                          \
         .bdt_used = (((uint32_t)1)<<(OPAL_DATATYPE_ ## NAME)),                       \
         .size = sizeof(TYPE),                                                        \
         .true_lb = 0, .true_ub = sizeof(TYPE), .lb = 0, .ub = sizeof(TYPE),          \
@@ -335,205 +344,205 @@ struct opal_datatype_t;
         .ptypes = OPAL_DATATYPE_INIT_PTYPES_ARRAY_UNAVAILABLE                        \
     }
 
-#define OPAL_DATATYPE_INITIALIZER_INT1(FLAGS)                  \
+#define OPAL_DATATYPE_INITIALIZER_INT1(FLAGS, OMPI_ID)         \
              OPAL_DATATYPE_HANDLE_INT1(                        \
              OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
-             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
-#define OPAL_DATATYPE_INITIALIZER_INT2(FLAGS)                  \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS, OMPI_ID)
+#define OPAL_DATATYPE_INITIALIZER_INT2(FLAGS, OMPI_ID)         \
              OPAL_DATATYPE_HANDLE_INT2(                        \
              OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
-             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
-#define OPAL_DATATYPE_INITIALIZER_INT4(FLAGS)                  \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS, OMPI_ID)
+#define OPAL_DATATYPE_INITIALIZER_INT4(FLAGS, OMPI_ID)         \
              OPAL_DATATYPE_HANDLE_INT4(                        \
              OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
-             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
-#define OPAL_DATATYPE_INITIALIZER_INT8(FLAGS)                  \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS, OMPI_ID)
+#define OPAL_DATATYPE_INITIALIZER_INT8(FLAGS, OMPI_ID)         \
              OPAL_DATATYPE_HANDLE_INT8(                        \
              OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
-             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
-#define OPAL_DATATYPE_INITIALIZER_INT16(FLAGS)                 \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS, OMPI_ID)
+#define OPAL_DATATYPE_INITIALIZER_INT16(FLAGS, OMPI_ID)        \
              OPAL_DATATYPE_HANDLE_INT16(                       \
              OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
-             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
-#define OPAL_DATATYPE_INITIALIZER_UINT1(FLAGS)                 \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS, OMPI_ID)
+#define OPAL_DATATYPE_INITIALIZER_UINT1(FLAGS, OMPI_ID)        \
              OPAL_DATATYPE_HANDLE_UINT1(                       \
              OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
-             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
-#define OPAL_DATATYPE_INITIALIZER_UINT2(FLAGS)                 \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS, OMPI_ID)
+#define OPAL_DATATYPE_INITIALIZER_UINT2(FLAGS, OMPI_ID)        \
              OPAL_DATATYPE_HANDLE_UINT2(                       \
              OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
-             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
-#define OPAL_DATATYPE_INITIALIZER_UINT4(FLAGS)                 \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS, OMPI_ID)
+#define OPAL_DATATYPE_INITIALIZER_UINT4(FLAGS, OMPI_ID)        \
              OPAL_DATATYPE_HANDLE_UINT4(                       \
              OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
-             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
-#define OPAL_DATATYPE_INITIALIZER_UINT8(FLAGS)                 \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS, OMPI_ID)
+#define OPAL_DATATYPE_INITIALIZER_UINT8(FLAGS, OMPI_ID)        \
              OPAL_DATATYPE_HANDLE_UINT8(                       \
              OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
-             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
-#define OPAL_DATATYPE_INITIALIZER_UINT16(FLAGS)                \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS, OMPI_ID)
+#define OPAL_DATATYPE_INITIALIZER_UINT16(FLAGS, OMPI_ID)       \
              OPAL_DATATYPE_HANDLE_UINT16(                      \
              OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
-             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
-#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS)                \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS, OMPI_ID)
+#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS, OMPI_ID)       \
              OPAL_DATATYPE_HANDLE_FLOAT2(                      \
              OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
-             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
-#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS)                \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS, OMPI_ID)
+#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS, OMPI_ID)       \
              OPAL_DATATYPE_HANDLE_FLOAT4(                      \
              OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
-             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
-#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS)                \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS, OMPI_ID)
+#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS, OMPI_ID)       \
              OPAL_DATATYPE_HANDLE_FLOAT8(                      \
              OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
-             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
-#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS)               \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS, OMPI_ID)
+#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS, OMPI_ID)      \
              OPAL_DATATYPE_HANDLE_FLOAT12(                     \
              OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
-             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
-#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS)               \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS, OMPI_ID)
+#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS, OMPI_ID)      \
              OPAL_DATATYPE_HANDLE_FLOAT16(                     \
              OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
-             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
-#define OPAL_DATATYPE_INITIALIZER_SHORT_FLOAT_COMPLEX(FLAGS)   \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS, OMPI_ID)
+#define OPAL_DATATYPE_INITIALIZER_SHORT_FLOAT_COMPLEX(FLAGS, OMPI_ID)   \
              OPAL_DATATYPE_HANDLE_SHORT_FLOAT_COMPLEX(         \
              OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
-             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
-#define OPAL_DATATYPE_INITIALIZER_FLOAT_COMPLEX(FLAGS)         \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS, OMPI_ID)
+#define OPAL_DATATYPE_INITIALIZER_FLOAT_COMPLEX(FLAGS, OMPI_ID) \
              OPAL_DATATYPE_HANDLE_FLOAT_COMPLEX(               \
              OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
-             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
-#define OPAL_DATATYPE_INITIALIZER_DOUBLE_COMPLEX(FLAGS)        \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS, OMPI_ID)
+#define OPAL_DATATYPE_INITIALIZER_DOUBLE_COMPLEX(FLAGS, OMPI_ID) \
              OPAL_DATATYPE_HANDLE_DOUBLE_COMPLEX(              \
              OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
-             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
-#define OPAL_DATATYPE_INITIALIZER_LONG_DOUBLE_COMPLEX(FLAGS)   \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS, OMPI_ID)
+#define OPAL_DATATYPE_INITIALIZER_LONG_DOUBLE_COMPLEX(FLAGS, OMPI_ID) \
              OPAL_DATATYPE_HANDLE_LONG_DOUBLE_COMPLEX(         \
              OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
-             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
-#define OPAL_DATATYPE_INITIALIZER_BOOL(FLAGS)                  \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS, OMPI_ID)
+#define OPAL_DATATYPE_INITIALIZER_BOOL(FLAGS, OMPI_ID)         \
              OPAL_DATATYPE_HANDLE_BOOL(                        \
              OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
-             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
-#define OPAL_DATATYPE_INITIALIZER_WCHAR(FLAGS)                 \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS, OMPI_ID)
+#define OPAL_DATATYPE_INITIALIZER_WCHAR(FLAGS, OMPI_ID)        \
              OPAL_DATATYPE_HANDLE_WCHAR(                       \
              OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
-             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
-#define OPAL_DATATYPE_INITIALIZER_LOOP(FLAGS)       OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_LOOP, LOOP_S, FLAGS )
-#define OPAL_DATATYPE_INITIALIZER_END_LOOP(FLAGS)   OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_END_LOOP, LOOP_E, FLAGS )
-#define OPAL_DATATYPE_INITIALIZER_LB(FLAGS)         OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_LB, LB, FLAGS )
-#define OPAL_DATATYPE_INITIALIZER_UB(FLAGS)         OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_UB, UB, FLAGS )
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS, OMPI_ID)
+#define OPAL_DATATYPE_INITIALIZER_LOOP(FLAGS, OMPI_ID)       OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_LOOP, LOOP_S, FLAGS, OMPI_ID )
+#define OPAL_DATATYPE_INITIALIZER_END_LOOP(FLAGS, OMPI_ID)   OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_END_LOOP, LOOP_E, FLAGS, OMPI_ID )
+#define OPAL_DATATYPE_INITIALIZER_LB(FLAGS, OMPI_ID)         OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_LB, LB, FLAGS, OMPI_ID )
+#define OPAL_DATATYPE_INITIALIZER_UB(FLAGS, OMPI_ID)         OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_UB, UB, FLAGS, OMPI_ID )
 
 
-#define OPAL_DATATYPE_HANDLE_INT1(AV, NOTAV, FLAGS)       AV( int8_t, OPAL_ALIGNMENT_INT8, INT1, FLAGS )
-#define OPAL_DATATYPE_HANDLE_INT2(AV, NOTAV, FLAGS)       AV( int16_t, OPAL_ALIGNMENT_INT16, INT2, FLAGS )
-#define OPAL_DATATYPE_HANDLE_INT4(AV, NOTAV, FLAGS)       AV( int32_t, OPAL_ALIGNMENT_INT32, INT4, FLAGS )
-#define OPAL_DATATYPE_HANDLE_INT8(AV, NOTAV, FLAGS)       AV( int64_t, OPAL_ALIGNMENT_INT64, INT8, FLAGS )
+#define OPAL_DATATYPE_HANDLE_INT1(AV, NOTAV, FLAGS, OMPI_ID)       AV( int8_t, OPAL_ALIGNMENT_INT8, INT1, FLAGS, OMPI_ID )
+#define OPAL_DATATYPE_HANDLE_INT2(AV, NOTAV, FLAGS, OMPI_ID)       AV( int16_t, OPAL_ALIGNMENT_INT16, INT2, FLAGS, OMPI_ID )
+#define OPAL_DATATYPE_HANDLE_INT4(AV, NOTAV, FLAGS, OMPI_ID)       AV( int32_t, OPAL_ALIGNMENT_INT32, INT4, FLAGS, OMPI_ID )
+#define OPAL_DATATYPE_HANDLE_INT8(AV, NOTAV, FLAGS, OMPI_ID)       AV( int64_t, OPAL_ALIGNMENT_INT64, INT8, FLAGS, OMPI_ID )
 #ifdef HAVE_INT128_T
-#define OPAL_DATATYPE_HANDLE_INT16(AV, NOTAV, FLAGS)      AV( int128_t, OPAL_ALIGNMENT_INT128, INT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_INT16(AV, NOTAV, FLAGS, OMPI_ID)      AV( int128_t, OPAL_ALIGNMENT_INT128, INT16, FLAGS, OMPI_ID )
 #else
-#define OPAL_DATATYPE_HANDLE_INT16(AV, NOTAV, FLAGS)      NOTAV( INT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_INT16(AV, NOTAV, FLAGS, OMPI_ID)      NOTAV( INT16, FLAGS )
 #endif
-#define OPAL_DATATYPE_HANDLE_UINT1(AV, NOTAV, FLAGS)      AV( uint8_t, OPAL_ALIGNMENT_INT8, UINT1, FLAGS )
-#define OPAL_DATATYPE_HANDLE_UINT2(AV, NOTAV, FLAGS)      AV( uint16_t, OPAL_ALIGNMENT_INT16, UINT2, FLAGS )
-#define OPAL_DATATYPE_HANDLE_UINT4(AV, NOTAV, FLAGS)      AV( uint32_t, OPAL_ALIGNMENT_INT32, UINT4, FLAGS )
-#define OPAL_DATATYPE_HANDLE_UINT8(AV, NOTAV, FLAGS)      AV( uint64_t, OPAL_ALIGNMENT_INT64, UINT8, FLAGS )
+#define OPAL_DATATYPE_HANDLE_UINT1(AV, NOTAV, FLAGS, OMPI_ID)      AV( uint8_t, OPAL_ALIGNMENT_INT8, UINT1, FLAGS, OMPI_ID )
+#define OPAL_DATATYPE_HANDLE_UINT2(AV, NOTAV, FLAGS, OMPI_ID)      AV( uint16_t, OPAL_ALIGNMENT_INT16, UINT2, FLAGS, OMPI_ID )
+#define OPAL_DATATYPE_HANDLE_UINT4(AV, NOTAV, FLAGS, OMPI_ID)      AV( uint32_t, OPAL_ALIGNMENT_INT32, UINT4, FLAGS, OMPI_ID )
+#define OPAL_DATATYPE_HANDLE_UINT8(AV, NOTAV, FLAGS, OMPI_ID)      AV( uint64_t, OPAL_ALIGNMENT_INT64, UINT8, FLAGS, OMPI_ID )
 #ifdef HAVE_UINT128_T
-#define OPAL_DATATYPE_HANDLE_UINT16(AV, NOTAV, FLAGS)     AV( uint128_t, OPAL_ALIGNMENT_INT128, UINT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_UINT16(AV, NOTAV, FLAGS, OMPI_ID)     AV( uint128_t, OPAL_ALIGNMENT_INT128, UINT16, FLAGS, OMPI_ID )
 #else
-#define OPAL_DATATYPE_HANDLE_UINT16(AV, NOTAV, FLAGS)     NOTAV( INT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_UINT16(AV, NOTAV, FLAGS, OMPI_ID)     NOTAV( UINT16, FLAGS )
 #endif
 
 #if defined(HAVE_SHORT_FLOAT) && SIZEOF_SHORT_FLOAT == 2
-#define OPAL_DATATYPE_HANDLE_FLOAT2(AV, NOTAV, FLAGS)     AV( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT2, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT2(AV, NOTAV, FLAGS, OMPI_ID)     AV( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT2, FLAGS, OMPI_ID )
 #elif SIZEOF_FLOAT == 2
-#define OPAL_DATATYPE_HANDLE_FLOAT2(AV, NOTAV, FLAGS)     AV( float, OPAL_ALIGNMENT_FLOAT, FLOAT2, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT2(AV, NOTAV, FLAGS, OMPI_ID)     AV( float, OPAL_ALIGNMENT_FLOAT, FLOAT2, FLAGS, OMPI_ID )
 #elif SIZEOF_DOUBLE == 2
-#define OPAL_DATATYPE_HANDLE_FLOAT2(AV, NOTAV, FLAGS)     AV( double, OPAL_ALIGNMENT_DOUBLE, FLOAT2, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT2(AV, NOTAV, FLAGS, OMPI_ID)     AV( double, OPAL_ALIGNMENT_DOUBLE, FLOAT2, FLAGS, OMPI_ID )
 #elif SIZEOF_LONG_DOUBLE == 2
-#define OPAL_DATATYPE_HANDLE_FLOAT2(AV, NOTAV, FLAGS)     AV( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT2, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT2(AV, NOTAV, FLAGS, OMPI_ID)     AV( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT2, FLAGS, OMPI_ID )
 #elif defined(HAVE_OPAL_SHORT_FLOAT_T) && SIZEOF_OPAL_SHORT_FLOAT_T == 2
-#define OPAL_DATATYPE_HANDLE_FLOAT2(AV, NOTAV, FLAGS)     AV( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT2, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT2(AV, NOTAV, FLAGS, OMPI_ID)     AV( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT2, FLAGS, OMPI_ID )
 #else
-#define OPAL_DATATYPE_HANDLE_FLOAT2(AV, NOTAV, FLAGS)     NOTAV( FLOAT2, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT2(AV, NOTAV, FLAGS, OMPI_ID)     NOTAV( FLOAT2, FLAGS )
 #endif
 
 #if defined(HAVE_SHORT_FLOAT) && SIZEOF_SHORT_FLOAT == 4
-#define OPAL_DATATYPE_HANDLE_FLOAT4(AV, NOTAV, FLAGS)     AV( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT4, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT4(AV, NOTAV, FLAGS, OMPI_ID)     AV( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT4, FLAGS, OMPI_ID )
 #elif SIZEOF_FLOAT == 4
-#define OPAL_DATATYPE_HANDLE_FLOAT4(AV, NOTAV, FLAGS)     AV( float, OPAL_ALIGNMENT_FLOAT, FLOAT4, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT4(AV, NOTAV, FLAGS, OMPI_ID)     AV( float, OPAL_ALIGNMENT_FLOAT, FLOAT4, FLAGS, OMPI_ID )
 #elif SIZEOF_DOUBLE == 4
-#define OPAL_DATATYPE_HANDLE_FLOAT4(AV, NOTAV, FLAGS)     AV( double, OPAL_ALIGNMENT_DOUBLE, FLOAT4, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT4(AV, NOTAV, FLAGS, OMPI_ID)     AV( double, OPAL_ALIGNMENT_DOUBLE, FLOAT4, FLAGS, OMPI_ID )
 #elif SIZEOF_LONG_DOUBLE == 4
-#define OPAL_DATATYPE_HANDLE_FLOAT4(AV, NOTAV, FLAGS)     AV( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT4, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT4(AV, NOTAV, FLAGS, OMPI_ID)     AV( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT4, FLAGS, OMPI_ID )
 #elif defined(HAVE_OPAL_SHORT_FLOAT_T) && SIZEOF_OPAL_SHORT_FLOAT_T == 4
-#define OPAL_DATATYPE_HANDLE_FLOAT4(AV, NOTAV, FLAGS)     AV( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT4, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT4(AV, NOTAV, FLAGS, OMPI_ID)     AV( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT4, FLAGS, OMPI_ID )
 #else
-#define OPAL_DATATYPE_HANDLE_FLOAT4(AV, NOTAV, FLAGS)     NOTAV( FLOAT4, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT4(AV, NOTAV, FLAGS, OMPI_ID)     NOTAV( FLOAT4, FLAGS )
 #endif
 
 #if defined(HAVE_SHORT_FLOAT) && SIZEOF_SHORT_FLOAT == 8
-#define OPAL_DATATYPE_HANDLE_FLOAT8(AV, NOTAV, FLAGS)     AV( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT8, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT8(AV, NOTAV, FLAGS, OMPI_ID)     AV( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT8, FLAGS, OMPI_ID )
 #elif SIZEOF_FLOAT == 8
-#define OPAL_DATATYPE_HANDLE_FLOAT8(AV, NOTAV, FLAGS)     AV( float, OPAL_ALIGNMENT_FLOAT, FLOAT8, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT8(AV, NOTAV, FLAGS, OMPI_ID)     AV( float, OPAL_ALIGNMENT_FLOAT, FLOAT8, FLAGS, OMPI_ID )
 #elif SIZEOF_DOUBLE == 8
-#define OPAL_DATATYPE_HANDLE_FLOAT8(AV, NOTAV, FLAGS)     AV( double, OPAL_ALIGNMENT_DOUBLE, FLOAT8, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT8(AV, NOTAV, FLAGS, OMPI_ID)     AV( double, OPAL_ALIGNMENT_DOUBLE, FLOAT8, FLAGS, OMPI_ID )
 #elif SIZEOF_LONG_DOUBLE == 8
-#define OPAL_DATATYPE_HANDLE_FLOAT8(AV, NOTAV, FLAGS)     AV( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT8, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT8(AV, NOTAV, FLAGS, OMPI_ID)     AV( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT8, FLAGS, OMPI_ID )
 #elif defined(HAVE_OPAL_SHORT_FLOAT_T) && SIZEOF_OPAL_SHORT_FLOAT_T == 8
-#define OPAL_DATATYPE_HANDLE_FLOAT8(AV, NOTAV, FLAGS)     AV( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT8, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT8(AV, NOTAV, FLAGS, OMPI_ID)     AV( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT8, FLAGS, OMPI_ID )
 #else
-#define OPAL_DATATYPE_HANDLE_FLOAT8(AV, NOTAV, FLAGS)     NOTAV( FLOAT8, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT8(AV, NOTAV, FLAGS, OMPI_ID)     NOTAV( FLOAT8, FLAGS )
 #endif
 
 #if defined(HAVE_SHORT_FLOAT) && SIZEOF_SHORT_FLOAT == 12
-#define OPAL_DATATYPE_HANDLE_FLOAT12(AV, NOTAV, FLAGS)    AV( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT12, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT12(AV, NOTAV, FLAGS, OMPI_ID)    AV( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT12, FLAGS, OMPI_ID )
 #elif SIZEOF_FLOAT == 12
-#define OPAL_DATATYPE_HANDLE_FLOAT12(AV, NOTAV, FLAGS)    AV( float, OPAL_ALIGNMENT_FLOAT, FLOAT12, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT12(AV, NOTAV, FLAGS, OMPI_ID)    AV( float, OPAL_ALIGNMENT_FLOAT, FLOAT12, FLAGS, OMPI_ID )
 #elif SIZEOF_DOUBLE == 12
-#define OPAL_DATATYPE_HANDLE_FLOAT12(AV, NOTAV, FLAGS)    AV( double, OPAL_ALIGNMENT_DOUBLE, FLOAT12, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT12(AV, NOTAV, FLAGS, OMPI_ID)    AV( double, OPAL_ALIGNMENT_DOUBLE, FLOAT12, FLAGS, OMPI_ID )
 #elif SIZEOF_LONG_DOUBLE == 12
-#define OPAL_DATATYPE_HANDLE_FLOAT12(AV, NOTAV, FLAGS)    AV( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT12, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT12(AV, NOTAV, FLAGS, OMPI_ID)    AV( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT12, FLAGS, OMPI_ID )
 #elif defined(HAVE_OPAL_SHORT_FLOAT_T) && SIZEOF_OPAL_SHORT_FLOAT_T == 12
-#define OPAL_DATATYPE_HANDLE_FLOAT12(AV, NOTAV, FLAGS)    AV( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT12, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT12(AV, NOTAV, FLAGS, OMPI_ID)    AV( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT12, FLAGS, OMPI_ID )
 #else
-#define OPAL_DATATYPE_HANDLE_FLOAT12(AV, NOTAV, FLAGS)    NOTAV( FLOAT12, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT12(AV, NOTAV, FLAGS, OMPI_ID)    NOTAV( FLOAT12, FLAGS )
 #endif
 
 #if defined(HAVE_SHORT_FLOAT) && SIZEOF_SHORT_FLOAT == 16
-#define OPAL_DATATYPE_HANDLE_FLOAT16(AV, NOTAV, FLAGS)    AV( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT16(AV, NOTAV, FLAGS, OMPI_ID)    AV( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT16, FLAGS, OMPI_ID )
 #elif SIZEOF_FLOAT == 16
-#define OPAL_DATATYPE_HANDLE_FLOAT16(AV, NOTAV, FLAGS)    AV( float, OPAL_ALIGNMENT_FLOAT, FLOAT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT16(AV, NOTAV, FLAGS, OMPI_ID)    AV( float, OPAL_ALIGNMENT_FLOAT, FLOAT16, FLAGS, OMPI_ID )
 #elif SIZEOF_DOUBLE == 16
-#define OPAL_DATATYPE_HANDLE_FLOAT16(AV, NOTAV, FLAGS)    AV( double, OPAL_ALIGNMENT_DOUBLE, FLOAT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT16(AV, NOTAV, FLAGS, OMPI_ID)    AV( double, OPAL_ALIGNMENT_DOUBLE, FLOAT16, FLAGS, OMPI_ID )
 #elif SIZEOF_LONG_DOUBLE == 16
-#define OPAL_DATATYPE_HANDLE_FLOAT16(AV, NOTAV, FLAGS)    AV( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT16(AV, NOTAV, FLAGS, OMPI_ID)    AV( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT16, FLAGS, OMPI_ID )
 #elif defined(HAVE_OPAL_SHORT_FLOAT_T) && SIZEOF_OPAL_SHORT_FLOAT_T == 16
-#define OPAL_DATATYPE_HANDLE_FLOAT16(AV, NOTAV, FLAGS)    AV( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT16(AV, NOTAV, FLAGS, OMPI_ID)    AV( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT16, FLAGS, OMPI_ID )
 #else
-#define OPAL_DATATYPE_HANDLE_FLOAT16(AV, NOTAV, FLAGS)    NOTAV( FLOAT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT16(AV, NOTAV, FLAGS, OMPI_ID)    NOTAV( FLOAT16, FLAGS )
 #endif
 
 #if defined(HAVE_SHORT_FLOAT__COMPLEX)
-#define OPAL_DATATYPE_HANDLE_SHORT_FLOAT_COMPLEX(AV, NOTAV, FLAGS) AV( short float _Complex, OPAL_ALIGNMENT_SHORT_FLOAT_COMPLEX, SHORT_FLOAT_COMPLEX, FLAGS )
+#define OPAL_DATATYPE_HANDLE_SHORT_FLOAT_COMPLEX(AV, NOTAV, FLAGS, OMPI_ID) AV( short float _Complex, OPAL_ALIGNMENT_SHORT_FLOAT_COMPLEX, SHORT_FLOAT_COMPLEX, FLAGS, OMPI_ID )
 #elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T)
-#define OPAL_DATATYPE_HANDLE_SHORT_FLOAT_COMPLEX(AV, NOTAV, FLAGS) AV( opal_short_float_complex_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, SHORT_FLOAT_COMPLEX, FLAGS )
+#define OPAL_DATATYPE_HANDLE_SHORT_FLOAT_COMPLEX(AV, NOTAV, FLAGS, OMPI_ID) AV( opal_short_float_complex_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, SHORT_FLOAT_COMPLEX, FLAGS, OMPI_ID )
 #else
-#define OPAL_DATATYPE_HANDLE_SHORT_FLOAT_COMPLEX(AV, NOTAV, FLAGS) NOTAV( SHORT_FLOAT_COMPLEX, FLAGS)
+#define OPAL_DATATYPE_HANDLE_SHORT_FLOAT_COMPLEX(AV, NOTAV, FLAGS, OMPI_ID) NOTAV( SHORT_FLOAT_COMPLEX, FLAGS )
 #endif
 
-#define OPAL_DATATYPE_HANDLE_FLOAT_COMPLEX(AV, NOTAV, FLAGS) AV( float _Complex, OPAL_ALIGNMENT_FLOAT_COMPLEX, FLOAT_COMPLEX, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT_COMPLEX(AV, NOTAV, FLAGS, OMPI_ID) AV( float _Complex, OPAL_ALIGNMENT_FLOAT_COMPLEX, FLOAT_COMPLEX, FLAGS, OMPI_ID )
 
-#define OPAL_DATATYPE_HANDLE_DOUBLE_COMPLEX(AV, NOTAV, FLAGS) AV( double _Complex, OPAL_ALIGNMENT_DOUBLE_COMPLEX, DOUBLE_COMPLEX, FLAGS )
+#define OPAL_DATATYPE_HANDLE_DOUBLE_COMPLEX(AV, NOTAV, FLAGS, OMPI_ID) AV( double _Complex, OPAL_ALIGNMENT_DOUBLE_COMPLEX, DOUBLE_COMPLEX, FLAGS, OMPI_ID )
 
-#define OPAL_DATATYPE_HANDLE_LONG_DOUBLE_COMPLEX(AV, NOTAV, FLAGS) AV( long double _Complex, OPAL_ALIGNMENT_LONG_DOUBLE_COMPLEX, LONG_DOUBLE_COMPLEX, FLAGS )
+#define OPAL_DATATYPE_HANDLE_LONG_DOUBLE_COMPLEX(AV, NOTAV, FLAGS, OMPI_ID) AV( long double _Complex, OPAL_ALIGNMENT_LONG_DOUBLE_COMPLEX, LONG_DOUBLE_COMPLEX, FLAGS, OMPI_ID )
 
-#define OPAL_DATATYPE_HANDLE_BOOL(AV, NOTAV, FLAGS)       AV( _Bool, OPAL_ALIGNMENT_BOOL, BOOL, FLAGS )
+#define OPAL_DATATYPE_HANDLE_BOOL(AV, NOTAV, FLAGS, OMPI_ID)       AV( _Bool, OPAL_ALIGNMENT_BOOL, BOOL, FLAGS, OMPI_ID )
 
 #if OPAL_ALIGNMENT_WCHAR != 0
-#define OPAL_DATATYPE_HANDLE_WCHAR(AV, NOTAV, FLAGS)      AV( wchar_t, OPAL_ALIGNMENT_WCHAR, WCHAR, FLAGS )
+#define OPAL_DATATYPE_HANDLE_WCHAR(AV, NOTAV, FLAGS, OMPI_ID)      AV( wchar_t, OPAL_ALIGNMENT_WCHAR, WCHAR, FLAGS, OMPI_ID )
 #else
-#define OPAL_DATATYPE_HANDLE_WCHAR(AV, NOTAV, FLAGS)      NOTAV( WCHAR, FLAGS )
+#define OPAL_DATATYPE_HANDLE_WCHAR(AV, NOTAV, FLAGS, OMPI_ID)      NOTAV( WCHAR, FLAGS )
 #endif
 
 #define BASIC_DDT_FROM_ELEM( ELEM ) (opal_datatype_basicDatatypes[(ELEM).elem.common.type])
diff --git a/opal/datatype/opal_datatype_module.c b/opal/datatype/opal_datatype_module.c
index 1ce57e67b50..f2cb165833d 100644
--- a/opal/datatype/opal_datatype_module.c
+++ b/opal/datatype/opal_datatype_module.c
@@ -20,6 +20,7 @@
  * Copyright (c) 2018      Triad National Security, LLC. All rights
  *                         reserved.
  * Copyright (c) 2018      FUJITSU LIMITED.  All rights reserved.
+ * Copyright (c) 2020      IBM Corporation.  All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -58,31 +59,31 @@ extern int opal_cuda_verbose;
  */
 OPAL_DECLSPEC const opal_datatype_t opal_datatype_empty =       OPAL_DATATYPE_INITIALIZER_EMPTY(OPAL_DATATYPE_FLAG_CONTIGUOUS);
 
-OPAL_DECLSPEC const opal_datatype_t opal_datatype_loop =        OPAL_DATATYPE_INITIALIZER_LOOP(0);
-OPAL_DECLSPEC const opal_datatype_t opal_datatype_end_loop =    OPAL_DATATYPE_INITIALIZER_END_LOOP(0);
-OPAL_DECLSPEC const opal_datatype_t opal_datatype_lb =          OPAL_DATATYPE_INITIALIZER_LB(0);
-OPAL_DECLSPEC const opal_datatype_t opal_datatype_ub =          OPAL_DATATYPE_INITIALIZER_UB(0);
-OPAL_DECLSPEC const opal_datatype_t opal_datatype_int1 =        OPAL_DATATYPE_INITIALIZER_INT1(0);
-OPAL_DECLSPEC const opal_datatype_t opal_datatype_int2 =        OPAL_DATATYPE_INITIALIZER_INT2(0);
-OPAL_DECLSPEC const opal_datatype_t opal_datatype_int4 =        OPAL_DATATYPE_INITIALIZER_INT4(0);
-OPAL_DECLSPEC const opal_datatype_t opal_datatype_int8 =        OPAL_DATATYPE_INITIALIZER_INT8(0);
-OPAL_DECLSPEC const opal_datatype_t opal_datatype_int16 =       OPAL_DATATYPE_INITIALIZER_INT16(0);
-OPAL_DECLSPEC const opal_datatype_t opal_datatype_uint1 =       OPAL_DATATYPE_INITIALIZER_UINT1(0);
-OPAL_DECLSPEC const opal_datatype_t opal_datatype_uint2 =       OPAL_DATATYPE_INITIALIZER_UINT2(0);
-OPAL_DECLSPEC const opal_datatype_t opal_datatype_uint4 =       OPAL_DATATYPE_INITIALIZER_UINT4(0);
-OPAL_DECLSPEC const opal_datatype_t opal_datatype_uint8 =       OPAL_DATATYPE_INITIALIZER_UINT8(0);
-OPAL_DECLSPEC const opal_datatype_t opal_datatype_uint16 =      OPAL_DATATYPE_INITIALIZER_UINT16(0);
-OPAL_DECLSPEC const opal_datatype_t opal_datatype_float2 =      OPAL_DATATYPE_INITIALIZER_FLOAT2(0);
-OPAL_DECLSPEC const opal_datatype_t opal_datatype_float4 =      OPAL_DATATYPE_INITIALIZER_FLOAT4(0);
-OPAL_DECLSPEC const opal_datatype_t opal_datatype_float8 =      OPAL_DATATYPE_INITIALIZER_FLOAT8(0);
-OPAL_DECLSPEC const opal_datatype_t opal_datatype_float12 =     OPAL_DATATYPE_INITIALIZER_FLOAT12(0);
-OPAL_DECLSPEC const opal_datatype_t opal_datatype_float16 =     OPAL_DATATYPE_INITIALIZER_FLOAT16(0);
-OPAL_DECLSPEC const opal_datatype_t opal_datatype_short_float_complex = OPAL_DATATYPE_INITIALIZER_SHORT_FLOAT_COMPLEX(0);
-OPAL_DECLSPEC const opal_datatype_t opal_datatype_float_complex = OPAL_DATATYPE_INITIALIZER_FLOAT_COMPLEX(0);
-OPAL_DECLSPEC const opal_datatype_t opal_datatype_double_complex = OPAL_DATATYPE_INITIALIZER_DOUBLE_COMPLEX(0);
-OPAL_DECLSPEC const opal_datatype_t opal_datatype_long_double_complex = OPAL_DATATYPE_INITIALIZER_LONG_DOUBLE_COMPLEX(0);
-OPAL_DECLSPEC const opal_datatype_t opal_datatype_bool =        OPAL_DATATYPE_INITIALIZER_BOOL(0);
-OPAL_DECLSPEC const opal_datatype_t opal_datatype_wchar =       OPAL_DATATYPE_INITIALIZER_WCHAR(0);
+OPAL_DECLSPEC const opal_datatype_t opal_datatype_loop =        OPAL_DATATYPE_INITIALIZER_LOOP(0, OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY);
+OPAL_DECLSPEC const opal_datatype_t opal_datatype_end_loop =    OPAL_DATATYPE_INITIALIZER_END_LOOP(0, OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY);
+OPAL_DECLSPEC const opal_datatype_t opal_datatype_lb =          OPAL_DATATYPE_INITIALIZER_LB(0, OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY);
+OPAL_DECLSPEC const opal_datatype_t opal_datatype_ub =          OPAL_DATATYPE_INITIALIZER_UB(0, OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY);
+OPAL_DECLSPEC const opal_datatype_t opal_datatype_int1 =        OPAL_DATATYPE_INITIALIZER_INT1(0, OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY);
+OPAL_DECLSPEC const opal_datatype_t opal_datatype_int2 =        OPAL_DATATYPE_INITIALIZER_INT2(0, OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY);
+OPAL_DECLSPEC const opal_datatype_t opal_datatype_int4 =        OPAL_DATATYPE_INITIALIZER_INT4(0, OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY);
+OPAL_DECLSPEC const opal_datatype_t opal_datatype_int8 =        OPAL_DATATYPE_INITIALIZER_INT8(0, OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY);
+OPAL_DECLSPEC const opal_datatype_t opal_datatype_int16 =       OPAL_DATATYPE_INITIALIZER_INT16(0, OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY);
+OPAL_DECLSPEC const opal_datatype_t opal_datatype_uint1 =       OPAL_DATATYPE_INITIALIZER_UINT1(0, OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY);
+OPAL_DECLSPEC const opal_datatype_t opal_datatype_uint2 =       OPAL_DATATYPE_INITIALIZER_UINT2(0, OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY);
+OPAL_DECLSPEC const opal_datatype_t opal_datatype_uint4 =       OPAL_DATATYPE_INITIALIZER_UINT4(0, OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY);
+OPAL_DECLSPEC const opal_datatype_t opal_datatype_uint8 =       OPAL_DATATYPE_INITIALIZER_UINT8(0, OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY);
+OPAL_DECLSPEC const opal_datatype_t opal_datatype_uint16 =      OPAL_DATATYPE_INITIALIZER_UINT16(0, OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY);
+OPAL_DECLSPEC const opal_datatype_t opal_datatype_float2 =      OPAL_DATATYPE_INITIALIZER_FLOAT2(0, OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY);
+OPAL_DECLSPEC const opal_datatype_t opal_datatype_float4 =      OPAL_DATATYPE_INITIALIZER_FLOAT4(0, OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY);
+OPAL_DECLSPEC const opal_datatype_t opal_datatype_float8 =      OPAL_DATATYPE_INITIALIZER_FLOAT8(0, OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY);
+OPAL_DECLSPEC const opal_datatype_t opal_datatype_float12 =     OPAL_DATATYPE_INITIALIZER_FLOAT12(0, OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY);
+OPAL_DECLSPEC const opal_datatype_t opal_datatype_float16 =     OPAL_DATATYPE_INITIALIZER_FLOAT16(0, OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY);
+OPAL_DECLSPEC const opal_datatype_t opal_datatype_short_float_complex = OPAL_DATATYPE_INITIALIZER_SHORT_FLOAT_COMPLEX(0, OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY);
+OPAL_DECLSPEC const opal_datatype_t opal_datatype_float_complex = OPAL_DATATYPE_INITIALIZER_FLOAT_COMPLEX(0, OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY);
+OPAL_DECLSPEC const opal_datatype_t opal_datatype_double_complex = OPAL_DATATYPE_INITIALIZER_DOUBLE_COMPLEX(0, OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY);
+OPAL_DECLSPEC const opal_datatype_t opal_datatype_long_double_complex = OPAL_DATATYPE_INITIALIZER_LONG_DOUBLE_COMPLEX(0, OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY);
+OPAL_DECLSPEC const opal_datatype_t opal_datatype_bool =        OPAL_DATATYPE_INITIALIZER_BOOL(0, OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY);
+OPAL_DECLSPEC const opal_datatype_t opal_datatype_wchar =       OPAL_DATATYPE_INITIALIZER_WCHAR(0, OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY);
 OPAL_DECLSPEC const opal_datatype_t opal_datatype_unavailable = OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED(UNAVAILABLE, 0);
 
 OPAL_DECLSPEC dt_elem_desc_t opal_datatype_predefined_elem_desc[2 * OPAL_DATATYPE_MAX_PREDEFINED] = {{{{0}}}};
@@ -261,6 +262,7 @@ int32_t opal_datatype_init( void )
                                                    OPAL_DATATYPE_FLAG_CONTIGUOUS |
                                                    OPAL_DATATYPE_FLAG_NO_GAPS;
         datatype->desc.desc[0].elem.common.type  = i;
+        datatype->desc.desc[0].elem.common.ompi_id  = OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY;
         datatype->desc.desc[0].elem.count        = 1;
         datatype->desc.desc[0].elem.blocklen     = 1;
         datatype->desc.desc[0].elem.disp         = 0;
diff --git a/opal/datatype/opal_datatype_optimize.c b/opal/datatype/opal_datatype_optimize.c
index 2e661b95daa..2cfe2aa5172 100644
--- a/opal/datatype/opal_datatype_optimize.c
+++ b/opal/datatype/opal_datatype_optimize.c
@@ -58,7 +58,7 @@ opal_datatype_optimize_short( opal_datatype_t* pData,
             ddt_endloop_desc_t* end_loop = &(pData->desc.desc[pos_desc].end_loop);
             if( 0 != last.count ) {
                 CREATE_ELEM( pElemDesc, last.common.type, OPAL_DATATYPE_FLAG_BASIC,
-                             last.blocklen, last.count, last.disp, last.extent );
+                             last.blocklen, last.count, last.disp, last.extent, last.common.ompi_id );
                 pElemDesc++; nbElems++;
                 last.count= 0;
             }
@@ -123,7 +123,7 @@ opal_datatype_optimize_short( opal_datatype_t* pData,
 
             if( 0 != last.count ) {  /* Generate the pending element */
                 CREATE_ELEM( pElemDesc, last.common.type, OPAL_DATATYPE_FLAG_BASIC,
-                             last.blocklen, last.count, last.disp, last.extent );
+                             last.blocklen, last.count, last.disp, last.extent, last.common.ompi_id );
                 pElemDesc++; nbElems++;
                 last.count       = 0;
                 last.common.type = OPAL_DATATYPE_LOOP;
@@ -136,7 +136,7 @@ opal_datatype_optimize_short( opal_datatype_t* pData,
                     for( uint32_t j = 0; j < (loop->items - 1); j++ ) {
                         current = &pData->desc.desc[pos_desc + index + j].elem;
                         CREATE_ELEM( pElemDesc, current->common.type, current->common.flags,
-                                     current->blocklen, current->count, current->disp + elem_displ, current->extent );
+                                     current->blocklen, current->count, current->disp + elem_displ, current->extent, current->common.ompi_id );
                         pElemDesc++; nbElems++;
                     }
                     elem_displ += loop->extent;
@@ -217,7 +217,7 @@ opal_datatype_optimize_short( opal_datatype_t* pData,
                 current->disp ) {
                 if( last.count != 1 ) {
                     CREATE_ELEM( pElemDesc, last.common.type, OPAL_DATATYPE_FLAG_BASIC,
-                                 last.blocklen, last.count - 1, last.disp, last.extent );
+                                 last.blocklen, last.count - 1, last.disp, last.extent, last.common.ompi_id );
                     pElemDesc++; nbElems++;
                     last.disp += (last.count - 1) * last.extent;
                     last.count = 1;
@@ -232,7 +232,7 @@ opal_datatype_optimize_short( opal_datatype_t* pData,
                 last.extent += current->extent;
                 if( current->count != 1 ) {
                     CREATE_ELEM( pElemDesc, last.common.type, OPAL_DATATYPE_FLAG_BASIC,
-                                 last.blocklen, last.count, last.disp, last.extent );
+                                 last.blocklen, last.count, last.disp, last.extent, last.common.ompi_id );
                     pElemDesc++; nbElems++;
                     last = *current;
                     last.count -= 1;
@@ -241,7 +241,7 @@ opal_datatype_optimize_short( opal_datatype_t* pData,
                 continue;
             }
             CREATE_ELEM( pElemDesc, last.common.type, OPAL_DATATYPE_FLAG_BASIC,
-                         last.blocklen, last.count, last.disp, last.extent );
+                         last.blocklen, last.count, last.disp, last.extent, last.common.ompi_id );
             pElemDesc++; nbElems++;
             last = *current;
         }
@@ -249,7 +249,7 @@ opal_datatype_optimize_short( opal_datatype_t* pData,
 
     if( 0 != last.count ) {
         CREATE_ELEM( pElemDesc, last.common.type, OPAL_DATATYPE_FLAG_BASIC,
-                     last.blocklen, last.count, last.disp, last.extent );
+                     last.blocklen, last.count, last.disp, last.extent, last.common.ompi_id );
         pElemDesc++; nbElems++;
     }
     /* cleanup the stack */
diff --git a/opal/datatype/opal_datatype_pack_unpack_predefined.h b/opal/datatype/opal_datatype_pack_unpack_predefined.h
index c516feb511d..87335a625ec 100644
--- a/opal/datatype/opal_datatype_pack_unpack_predefined.h
+++ b/opal/datatype/opal_datatype_pack_unpack_predefined.h
@@ -298,14 +298,14 @@ opal_datatype_unpack_predefined_element( unsigned char** rtn_src,
  *  Otherwise we'd have to copy and maintain essentially the same blob of
  *  macros that already exist in opal_datatype_internal.h.
  */
-#define OPAL_DATATYPE_MYUNPACK(NAME)               \
-    do {                                           \
-        OPAL_DATATYPE_HANDLE_ ## NAME(             \
-            OPAL_DATATYPE_MYUNPACK_AVAILABLE,      \
-            OPAL_DATATYPE_MYUNPACK_NOTAVAIL, 0);   \
+#define OPAL_DATATYPE_MYUNPACK(NAME)                \
+    do {                                            \
+        OPAL_DATATYPE_HANDLE_ ## NAME(              \
+            OPAL_DATATYPE_MYUNPACK_AVAILABLE,       \
+            OPAL_DATATYPE_MYUNPACK_NOTAVAIL, 0, 0); \
     } while (0)
 
-#define OPAL_DATATYPE_MYUNPACK_AVAILABLE(TYPE, unused_ALIGN, NAME, unused) \
+#define OPAL_DATATYPE_MYUNPACK_AVAILABLE(TYPE, unused_ALIGN, NAME, unused, unused2) \
     do { \
         OPAL_DATATYPE_UNPACK_PREDEFINED_ELEMENT(src, dest, cando_count, stride, blocklen, TYPE); \
         success = true; \
@@ -436,10 +436,10 @@ opal_datatype_pack_predefined_element( unsigned char** rtn_src,
     do {                                           \
         OPAL_DATATYPE_HANDLE_ ## NAME(             \
             OPAL_DATATYPE_MYPACK_AVAILABLE,        \
-            OPAL_DATATYPE_MYPACK_NOTAVAIL, 0);     \
+            OPAL_DATATYPE_MYPACK_NOTAVAIL, 0, 0);  \
     } while (0)
 
-#define OPAL_DATATYPE_MYPACK_AVAILABLE(TYPE, unused_ALIGN, NAME, unused) \
+#define OPAL_DATATYPE_MYPACK_AVAILABLE(TYPE, unused_ALIGN, NAME, unused, unused2) \
     do { \
         OPAL_DATATYPE_PACK_PREDEFINED_ELEMENT(src, dest, cando_count, stride, blocklen, TYPE); \
         success = true; \
diff --git a/opal/datatype/opal_datatype_reset_predef.c b/opal/datatype/opal_datatype_reset_predef.c
new file mode 100644
index 00000000000..0454846aa3f
--- /dev/null
+++ b/opal/datatype/opal_datatype_reset_predef.c
@@ -0,0 +1,99 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2020      IBM Corporation.  All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "opal_config.h"
+
+#include <stddef.h>
+
+#include "opal/runtime/opal.h"
+#include "opal/util/arch.h"
+#include "opal/util/output.h"
+#include "opal/datatype/opal_datatype_internal.h"
+#include "opal/datatype/opal_datatype.h"
+#include "opal/datatype/opal_convertor_internal.h"
+#include "opal/mca/base/mca_base_var.h"
+
+/*
+ *  Give a predefined type a private copy of its description so that
+ *  desc[0].elem.common.ompi_id can be set for this type alone.
+ *
+ *  The .desc fields of the predefined OMPI types start out as offsets into
+ *  opal_datatype_predefined_elem_desc[], with several OMPI types sharing
+ *  the same offset, so the entry must be copied before it is modified.
+ *  Types whose .desc is not inside that table are left untouched.
+ *
+ *  Returns OPAL_SUCCESS, or -1 if the private copy cannot be allocated.
+ */
+int
+opal_datatype_desc_update_reset(opal_datatype_t *pData)
+{
+    /* "end" is one past the last entry, so it is excluded from the range */
+    const dt_elem_desc_t *begin = opal_datatype_predefined_elem_desc;
+    const dt_elem_desc_t *end = begin + 2 * OPAL_DATATYPE_MAX_PREDEFINED;
+    const dt_elem_desc_t *shared = pData->desc.desc;
+
+    if (shared < begin || shared >= end) {
+        return OPAL_SUCCESS;
+    }
+
+    /* desc and opt_desc may alias; if so they must keep aliasing */
+    int opt_equals_desc = (pData->opt_desc.desc == pData->desc.desc);
+
+    dt_elem_desc_t *priv = malloc(2 * sizeof *priv);
+    if (!priv) { return -1; }
+    memcpy(priv, shared, 2 * sizeof *priv);
+
+    pData->desc.desc = priv;
+    if (opt_equals_desc) {
+        pData->opt_desc.desc = priv;
+    }
+
+    /*
+     *  For the predefined types the relevant entry is desc[0]; its initial
+     *  contents came from opal_datatype_init().
+     */
+    pData->desc.desc[0].elem.common.ompi_id = pData->ompi_id;
+    pData->flags |= OPAL_DATATYPE_FLAG_DESC_WAS_REALLOCATED;
+
+    return OPAL_SUCCESS;
+}
+
+/*
+ *  Release the private .desc array that
+ *  opal_datatype_desc_update_reset() installed, if it installed one.
+ *  Types without OPAL_DATATYPE_FLAG_DESC_WAS_REALLOCATED are untouched.
+ *  On return .desc is NULL, and so is .opt_desc if it aliased .desc.
+ *
+ *  desc and opt_desc are either the same pointer or two different
+ *  ones.  When they are the same they must stay the same across this
+ *  change, because opal_datatype_destruct() later takes alternate
+ *  paths for same vs different desc/opt_desc.
+ */
+void
+opal_datatype_desc_update_free(opal_datatype_t *pData)
+{
+    if (!(pData->flags & OPAL_DATATYPE_FLAG_DESC_WAS_REALLOCATED)) {
+        return;
+    }
+
+    /* capture the aliasing before .desc is overwritten below */
+    const int was_aliased = (pData->opt_desc.desc == pData->desc.desc);
+
+    /* free(NULL) is a no-op, so the pointer needs no NULL guard */
+    free(pData->desc.desc);
+    pData->desc.desc = NULL;
+    if (was_aliased) {
+        pData->opt_desc.desc = NULL;
+    }
+
+    /* the flag is set on this path, so toggling it clears it */
+    pData->flags ^= OPAL_DATATYPE_FLAG_DESC_WAS_REALLOCATED;
+}
diff --git a/opal/datatype/opal_datatype_setexternal32.c b/opal/datatype/opal_datatype_setexternal32.c
new file mode 100644
index 00000000000..b04feee28b5
--- /dev/null
+++ b/opal/datatype/opal_datatype_setexternal32.c
@@ -0,0 +1,32 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2020      IBM Corporation.  All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "opal_config.h"
+
+#include <stddef.h>
+
+#include "opal/runtime/opal.h"
+#include "opal/util/arch.h"
+#include "opal/util/output.h"
+#include "opal/datatype/opal_datatype_internal.h"
+#include "opal/datatype/opal_datatype.h"
+#include "opal/datatype/opal_convertor_internal.h"
+#include "opal/mca/base/mca_base_var.h"
+
+void
+opal_convertor_set_ompi_remote_size(opal_convertor_t *convertor, int idx, int size)
+{   /* Record, in the convertor's master, the remote size for entry idx. */
+    convertor->master->ompi_remote_sizes[idx] = size; /* NOTE(review): idx is not range-checked; presumably an .ompi_id -- confirm */
+}
+void
+opal_convertor_ompi_remote_size_is_ready(opal_convertor_t *convertor)
+{   /* Raise the master's ompi_remote_sizes_is_set flag. */
+    convertor->master->ompi_remote_sizes_is_set = 1; /* presumably set once every ompi_remote_sizes[] entry is stored -- confirm */
+}
diff --git a/test/datatype/ddt_pack.c b/test/datatype/ddt_pack.c
index 60d5fd278d1..35575031d2a 100644
--- a/test/datatype/ddt_pack.c
+++ b/test/datatype/ddt_pack.c
@@ -88,7 +88,15 @@ main(int argc, char* argv[])
     unpacked_dt = ompi_datatype_create_from_packed_description(&payload,
                                                                ompi_proc_local());
     free(ptr);
-    if (unpacked_dt == &ompi_mpi_int32_t.dt) {
+    /*
+     *  Update: this test used to expect the returned unpacked_dt
+     *  to be ompi_mpi_int32_t.dt, probably because we used to have
+     *  #define OMPI_DATATYPE_MPI_INT OMPI_DATATYPE_MPI_INT32_T
+     *  so the only base OMPI datatype represented in the ID numbers
+     *  was for MPI_INT32_T.  This PR separated the IDs so external32
+     *  can have the sizes diverge.
+     */
+    if (unpacked_dt == &ompi_mpi_int.dt) {
         printf("\tPASSED\n");
     } else {
         printf("\tFAILED: datatypes don't match\n");
@@ -438,7 +446,7 @@ main(int argc, char* argv[])
         }
         printf("\tPASSED\n");
     }
-    if (unpacked_dt == &ompi_mpi_int32_t.dt) {
+    if (unpacked_dt == &ompi_mpi_int.dt) {
         printf("\tPASSED\n");
     } else {
         printf("\tFAILED: datatypes don't match\n");

From 1aebdec72101a33ef8a2becce30f5efe60884028 Mon Sep 17 00:00:00 2001
From: Mark Allen <markalle@us.ibm.com>
Date: Tue, 7 Jul 2020 12:48:42 -0400
Subject: [PATCH 2/3] external32 fixes

* let external32 use the non-optimized datatype description so
  it can know what it's supposed to be packing
* redo the pFunctions arguments to take a vector
  (ddt_elem_desc_t) to be iterated over and a packed buf
  as its main arguments
* VECTOR traversal utility to walk an .elem from a datatype's
  description (using conv_ptr and count_desc same as elsewhere)

Here's a gist for a testcase pack2.c
https://gist.github.com/markalle/6d70cf8ca14761e94bce9d0240000a3e
% mpicc -o x pack2.c
% mpirun -np 1 ./x

Without this fix, if sizeof(long) is 8 and the machine is little endian,
an input buffer of (long)1 == [01 00 00 00  00 00 00 00] packs external32
as [00 00 00 00  00 00 00 01], using 8 bytes.  But the external32
representation for MPI_LONG is supposed to be 4 bytes.

This commit uses .ompi_id and .ompi_remote_sizes[] if available
to figure out what the external32 size should be (_r_blength).

In the functions like copy_int8_heterogeneous() that are made from
pack/unpack with pFunction[], vector walking macros are used to
walk the vector buffer and the byte swapping conversion function
has more arguments to know the size and endianness of both the
"from" and "to" element.

A few other misc changes that went into this:
- changed the bit location for OPAL_ARCH_LONGIS64 because it wasn't living
  under the OPAL_ARCH_LONGISxx mask which appears to be how those masks
  are designed to work
- the terminating condition in pack used to say "bConverted == local_size"
  but bConverted is counting bytes on the "to" side and for pack the "to"
  side is the remote_size.

Signed-off-by: Mark Allen <markalle@us.ibm.com>
---
 opal/datatype/opal_convertor.c                |  21 +-
 opal/datatype/opal_convertor_internal.h       |  15 +-
 opal/datatype/opal_copy_functions.c           | 138 +++----
 .../opal_copy_functions_heterogeneous.c       | 349 +++++++++++++-----
 opal/datatype/opal_datatype_internal.h        | 104 ++++++
 opal/datatype/opal_datatype_pack.c            |  28 +-
 opal/datatype/opal_datatype_unpack.c          |  29 +-
 opal/util/arch.h                              |   3 +-
 8 files changed, 498 insertions(+), 189 deletions(-)

diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c
index ae2d0ac31a3..718f747d525 100644
--- a/opal/datatype/opal_convertor.c
+++ b/opal/datatype/opal_convertor.c
@@ -15,6 +15,7 @@
  * Copyright (c) 2013-2018 Research Organization for Information Science
  *                         and Technology (RIST).  All rights reserved.
  * Copyright (c) 2017      Intel, Inc. All rights reserved
+ * Copyright (c) 2020      IBM Corporation.  All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -521,8 +522,26 @@ size_t opal_convertor_compute_remote_size( opal_convertor_t* pConvertor )
     
     pConvertor->remote_size = pConvertor->local_size;
     if( OPAL_UNLIKELY(datatype->bdt_used & pConvertor->master->hetero_mask) ) {
+        int is_send_conversion = 0;
+        if (pConvertor->flags & CONVERTOR_SEND_CONVERSION) {
+            /*
+             * Adding to the conditions for keeping the optimized description.
+             * Now it's only optimized if (send && contiguous &&
+             * !something like external32 that needs conversion)
+             *
+             * Note, elsewhere there are similar checks that boil down to
+             * checking that CONVERTOR_SEND_CONVERSION is on but that
+             * HOMOGENEOUS is off.  That kind of makes sense, except
+             * OPAL_CONVERTOR_PREPARE seems to universally set HOMOGENEOUS
+             * so I don't think that setting means what it looks like it
+             * means, so I'm not using it.
+             */
+            is_send_conversion = 1;
+        }
         pConvertor->flags &= (~CONVERTOR_HOMOGENEOUS);
-        if (!(pConvertor->flags & CONVERTOR_SEND && pConvertor->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS)) {
+        if (!(pConvertor->flags & CONVERTOR_SEND && pConvertor->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS
+            && !is_send_conversion))
+        {
             pConvertor->use_desc = &(datatype->desc);
         }
         if( 0 == (pConvertor->flags & CONVERTOR_HAS_REMOTE_SIZE) ) {
diff --git a/opal/datatype/opal_convertor_internal.h b/opal/datatype/opal_convertor_internal.h
index 94622060743..ff4dda73764 100644
--- a/opal/datatype/opal_convertor_internal.h
+++ b/opal/datatype/opal_convertor_internal.h
@@ -7,6 +7,7 @@
  * Copyright (c) 2013      Cisco Systems, Inc.  All rights reserved.
  * Copyright (c) 2017      Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
+ * Copyright (c) 2020      IBM Corporation.  All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -19,13 +20,19 @@
 #include "opal_config.h"
 
 #include "opal/datatype/opal_convertor.h"
+#include "opal/datatype/opal_datatype_internal.h"
 
 BEGIN_C_DECLS
 
-typedef int32_t (*conversion_fct_t)( opal_convertor_t* pConvertor, uint32_t count,
-                                     const void* from, size_t from_len, ptrdiff_t from_extent,
-                                     void* to, size_t to_length, ptrdiff_t to_extent,
-                                     ptrdiff_t *advance );
+/* returns elements processed in the packed_buf for this call */
+typedef size_t (*conversion_fct_t)( opal_convertor_t* pConvertor,
+                                     int mode,
+                                     const ddt_elem_desc_t *elem,
+                                     char** pconv_ptr,
+                                     size_t* pcount_desc,
+                                     char** ppacked_buf,
+                                     size_t* packed_len,
+                                     size_t packed_element_size);
 
 typedef struct opal_convertor_master_t {
     struct opal_convertor_master_t* next;
diff --git a/opal/datatype/opal_copy_functions.c b/opal/datatype/opal_copy_functions.c
index ad315f787e4..cbab1144b2f 100644
--- a/opal/datatype/opal_copy_functions.c
+++ b/opal/datatype/opal_copy_functions.c
@@ -8,6 +8,7 @@
  *                         and Technology (RIST).  All rights reserved.
  * Copyright (c) 2015      Cisco Systems, Inc.  All rights reserved.
  * Copyright (c) 2018      FUJITSU LIMITED.  All rights reserved.
+ * Copyright (c) 2020      IBM Corporation.  All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -39,42 +40,40 @@
  *
  * Return value: Number of elements of type TYPE copied
  */
-#define COPY_TYPE( TYPENAME, TYPE, COUNT )                                              \
-static int copy_##TYPENAME( opal_convertor_t *pConvertor, size_t count,                 \
-                            char* from, size_t from_len, ptrdiff_t from_extent,         \
-                            char* to, size_t to_len, ptrdiff_t to_extent,               \
-                            ptrdiff_t *advance)                                         \
-{                                                                                       \
-    size_t remote_TYPE_size = sizeof(TYPE) * (COUNT); /* TODO */                        \
-    size_t local_TYPE_size = (COUNT) * sizeof(TYPE);                                    \
-                                                                                        \
-    /* make sure the remote buffer is large enough to hold the data */                  \
-    if( (remote_TYPE_size * count) > from_len ) {                                       \
-        count = from_len / remote_TYPE_size;                                            \
-        if( (count * remote_TYPE_size) != from_len ) {                                  \
-            DUMP( "oops should I keep this data somewhere (excedent %d bytes)?\n",      \
-                  from_len - (count * remote_TYPE_size) );                              \
-        }                                                                               \
-        DUMP( "correct: copy %s count %d from buffer %p with length %d to %p space %d\n", \
-              #TYPE, count, from, from_len, to, to_len );                               \
-    } else                                                                              \
-        DUMP( "         copy %s count %d from buffer %p with length %d to %p space %d\n", \
-              #TYPE, count, from, from_len, to, to_len );                               \
-                                                                                        \
-    if( (from_extent == (ptrdiff_t)local_TYPE_size) &&                          \
-        (to_extent == (ptrdiff_t)remote_TYPE_size) ) {                          \
-        /* copy of contigous data at both source and destination */                     \
-        MEMCPY( to, from, count * local_TYPE_size );                                    \
-    } else {                                                                            \
-        /* source or destination are non-contigous */                                   \
-        for(size_t i = 0; i < count; i++ ) {                                            \
-            MEMCPY( to, from, local_TYPE_size );                                        \
-            to += to_extent;                                                            \
-            from += from_extent;                                                        \
-        }                                                                               \
-    }                                                                                   \
-    *advance = count * from_extent;                                                     \
-    return count;                                                                       \
+#define COPY_TYPE( TYPENAME, TYPE, COUNT )                               \
+static size_t copy_##TYPENAME( opal_convertor_t *pConvertor,             \
+                            int mode,                                    \
+                            ddt_elem_desc_t *elem,                       \
+                            size_t* pconv_ptr,                           \
+                            size_t* pcount_desc,                         \
+                            char** ppacked_buf,                          \
+                            size_t* ppacked_len,                         \
+                            size_t element_size)                         \
+{   /* byte-for-byte copy between a vector elem and a packed buffer */   \
+    vector_iterator_state_t vec;                                         \
+    size_t i;                /* NOTE(review): appears unused here */     \
+    int opal_type = elem->common.type;  /* NOTE(review): same */         \
+    vector_iter_load_current_state(&vec, elem, pconv_ptr, pcount_desc);  \
+    char **pfrom, **pto;                                                 \
+    if (mode == COPY_TO_VECTOR) {       /* packed buffer -> vector */    \
+        pto = &vec.buf;                                                  \
+        pfrom = ppacked_buf;                                             \
+    } else {                            /* vector -> packed buffer */    \
+        pfrom = &vec.buf;                                                \
+        pto = ppacked_buf;                                               \
+    }                                                                    \
+    /* run while elements remain and a whole element still fits */       \
+    while (*pcount_desc != 0 && *ppacked_len >= element_size) {          \
+        size_t mycount = vec.count;                                      \
+        if (mycount * element_size > *ppacked_len) { /* clamp to fit */  \
+            mycount = *ppacked_len / element_size;                       \
+        }                                                                \
+        MEMCPY(*pto, *pfrom, mycount * element_size );                   \
+        vector_iter_consume(&vec, mycount);                              \
+        *ppacked_buf += mycount * element_size;                          \
+        *ppacked_len -= mycount * element_size;                          \
+     }                                                                   \
+    return 0;  /* NOTE(review): always 0, not a count of elements */     \
 }
 
 /*
@@ -91,39 +90,40 @@ static int copy_##TYPENAME( opal_convertor_t *pConvertor, size_t count,
  *
  * Return value: Number of elements of type TYPE copied
  */
-#define COPY_CONTIGUOUS_BYTES( TYPENAME, COUNT )                                          \
-static size_t copy_##TYPENAME##_##COUNT( opal_convertor_t *pConvertor, size_t count,         \
-                                         char* from, size_t from_len, ptrdiff_t from_extent, \
-                                         char* to, size_t to_len, ptrdiff_t to_extent,       \
-                                         ptrdiff_t *advance )              \
-{                                                                               \
-    size_t remote_TYPE_size = (size_t)(COUNT); /* TODO */                       \
-    size_t local_TYPE_size = (size_t)(COUNT);                                   \
-                                                                                \
-    if( (remote_TYPE_size * count) > from_len ) {                               \
-        count = from_len / remote_TYPE_size;                                    \
-        if( (count * remote_TYPE_size) != from_len ) {                          \
-            DUMP( "oops should I keep this data somewhere (excedent %d bytes)?\n", \
-                  from_len - (count * remote_TYPE_size) );                      \
-        }                                                                       \
-        DUMP( "correct: copy %s count %d from buffer %p with length %d to %p space %d\n", \
-              #TYPENAME, count, from, from_len, to, to_len );                   \
-    } else                                                                      \
-        DUMP( "         copy %s count %d from buffer %p with length %d to %p space %d\n", \
-              #TYPENAME, count, from, from_len, to, to_len );                   \
-                                                                                \
-    if( (from_extent == (ptrdiff_t)local_TYPE_size) &&                  \
-        (to_extent == (ptrdiff_t)remote_TYPE_size) ) {                  \
-        MEMCPY( to, from, count * local_TYPE_size );                            \
-    } else {                                                                    \
-        for(size_t i = 0; i < count; i++ ) {                                    \
-            MEMCPY( to, from, local_TYPE_size );                                \
-            to += to_extent;                                                    \
-            from += from_extent;                                                \
-        }                                                                       \
-    }                                                                           \
-    *advance = count * from_extent;                                             \
-    return count;                                                               \
+#define COPY_CONTIGUOUS_BYTES( TYPENAME, COUNT )                         \
+static size_t copy_##TYPENAME##_##COUNT( opal_convertor_t *pConvertor,   \
+                            int mode,                                    \
+                            ddt_elem_desc_t *elem,                       \
+                            size_t* pconv_ptr,                           \
+                            size_t* pcount_desc,                         \
+                            char** ppacked_buf,                          \
+                            size_t* ppacked_len,                         \
+                            size_t element_size)                         \
+{   /* byte-for-byte copy between a vector elem and a packed buffer */   \
+    vector_iterator_state_t vec;                                         \
+    size_t i;                /* NOTE(review): appears unused here */     \
+    int opal_type = elem->common.type;  /* NOTE(review): same */         \
+    vector_iter_load_current_state(&vec, elem, pconv_ptr, pcount_desc);  \
+    char **pfrom, **pto;                                                 \
+    if (mode == COPY_TO_VECTOR) {       /* packed buffer -> vector */    \
+        pto = &vec.buf;                                                  \
+        pfrom = ppacked_buf;                                             \
+    } else {                            /* vector -> packed buffer */    \
+        pfrom = &vec.buf;                                                \
+        pto = ppacked_buf;                                               \
+    }                                                                    \
+    /* run while elements remain and a whole element still fits */       \
+    while (*pcount_desc != 0 && *ppacked_len >= element_size) {          \
+        size_t mycount = vec.count;                                      \
+        if (mycount * element_size > *ppacked_len) { /* clamp to fit */  \
+            mycount = *ppacked_len / element_size;                       \
+        }                                                                \
+        MEMCPY(*pto, *pfrom, mycount * element_size );                   \
+        vector_iter_consume(&vec, mycount);                              \
+        *ppacked_buf += mycount * element_size;                          \
+        *ppacked_len -= mycount * element_size;                          \
+     }                                                                   \
+    return 0;  /* NOTE(review): always 0, not a count of elements */     \
 }
 
 /* set up copy functions for the basic C MPI data types */
diff --git a/opal/datatype/opal_copy_functions_heterogeneous.c b/opal/datatype/opal_copy_functions_heterogeneous.c
index 7a3e7d57f92..a44eb34324b 100644
--- a/opal/datatype/opal_copy_functions_heterogeneous.c
+++ b/opal/datatype/opal_copy_functions_heterogeneous.c
@@ -7,6 +7,7 @@
  * Copyright (c) 2015-2018 Research Organization for Information Science
  *                         and Technology (RIST).  All rights reserved.
  * Copyright (c) 2018      FUJITSU LIMITED.  All rights reserved.
+ * Copyright (c) 2020      IBM Corporation.  All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -22,6 +23,7 @@
 
 #include <stddef.h>
 #include <stdint.h>
+#include <strings.h>
 
 #include "opal/util/arch.h"
 
@@ -45,24 +47,61 @@
  */
 
 static inline void
-opal_dt_swap_bytes(void *to_p, const void *from_p, const size_t size, size_t count)
+opal_dt_swap_bytes(void *to_p, int to_size, int to_is_bigendian,
+    const void *from_p, int from_size, int from_is_bigendian,
+    size_t count)
 {
     size_t i;
-    size_t back_i = size - 1;
+    size_t back_i;
     uint8_t *to = (uint8_t*) to_p;
     uint8_t *from = (uint8_t*) from_p;
 
+/*
+ * Copies count elements from from_p to to_p, reversing the byte order and
+ * resizing each element from from_size to to_size bytes.  Examples:
+ * 8b->4b big to little : [-- -- -- --  00 00 00 01] -> [01 00 00 00]
+ * 8b->4b little to big : [01 00 00 00  -- -- -- --] -> [00 00 00 01]
+ * 4b->8b big to little : [00 00 00 01] -> [01 00 00 00  00 00 00 00]
+ * 4b->8b little to big : [01 00 00 00] -> [00 00 00 00  00 00 00 01]
+ * The from/to endianness flags only select which end is dropped or padded.
+ */
+    unsigned int bytes_to_copy, from_offset, to_offset;
+    if (from_size > to_size) {
+        bytes_to_copy = to_size;
+        to_offset = 0;
+        if (from_is_bigendian) {
+            from_offset = from_size - to_size;
+        } else {
+            from_offset = 0;
+        }
+    } else if (to_size > from_size) {
+        bytes_to_copy = from_size;
+        from_offset = 0;
+        if (to_is_bigendian) {
+            to_offset = to_size - from_size;
+        } else {
+            to_offset = 0;
+        }
+        /* Pre-zero the whole destination rather than padding per element. */
+        /* NOTE(review): this zero-extends, so a negative signed value is */
+        /* not sign-extended when lengthened -- confirm that is intended. */
+        bzero(to_p, to_size * count);
+    } else {
+        bytes_to_copy = from_size;
+        from_offset = to_offset = 0;
+    }
+
     /* Do the first element */
-    for (i = 0 ; i < size ; i++, back_i--) {
-        to[back_i] = from[i];
+    for (i = 0, back_i = bytes_to_copy - 1 ; i < bytes_to_copy ; i++, back_i--) {
+        to[to_offset + back_i] = from[from_offset + i];
     }
     /* Do all the others if any */
     while(count > 1) {
-        to += size;
-        from += size;
+        to += to_size;
+        from += from_size;
         count--;
-        for (i = 0, back_i = size - 1 ; i < size ; i++, back_i--) {
-            to[back_i] = from[i];
+        for (i = 0, back_i = bytes_to_copy - 1 ; i < bytes_to_copy ; i++, back_i--) {
+            to[to_offset + back_i] = from[from_offset + i];
         }
     }
 }
@@ -141,97 +180,219 @@ opal_dt_swap_long_double(void *to_p, const void *from_p, const size_t size, size
 #define COPY_TYPE_HETEROGENEOUS( TYPENAME, TYPE )                                         \
             COPY_TYPE_HETEROGENEOUS_INTERNAL( TYPENAME, TYPE, 0 )
 
-#define COPY_TYPE_HETEROGENEOUS_INTERNAL( TYPENAME, TYPE, LONG_DOUBLE )                   \
-static int32_t                                                                            \
-copy_##TYPENAME##_heterogeneous(opal_convertor_t *pConvertor, size_t count,               \
-                                const char* from, size_t from_len, ptrdiff_t from_extent, \
-                                char* to, size_t to_length, ptrdiff_t to_extent,          \
-                                ptrdiff_t *advance)             \
-{                                                                       \
-    size_t i;                                                           \
-                                                                        \
-    datatype_check( #TYPE, sizeof(TYPE), sizeof(TYPE), &count,          \
-                   from, from_len, from_extent,                         \
-                   to, to_length, to_extent);                           \
-                                                                        \
-    if ((pConvertor->remoteArch & OPAL_ARCH_ISBIGENDIAN) !=             \
-        (opal_local_arch & OPAL_ARCH_ISBIGENDIAN)) {                    \
-        if( (to_extent == from_extent) && (to_extent == sizeof(TYPE)) ) { \
-            opal_dt_swap_bytes(to, from, sizeof(TYPE), count);          \
-            if (LONG_DOUBLE) {                                          \
-                opal_dt_swap_long_double(to, from, sizeof(TYPE), count, pConvertor->remoteArch);\
-            }                                                           \
-        } else {                                                        \
-            for( i = 0; i < count; i++ ) {                              \
-                opal_dt_swap_bytes(to, from, sizeof(TYPE), 1);          \
-                if (LONG_DOUBLE) {                                      \
-                    opal_dt_swap_long_double(to, from, sizeof(TYPE), 1, pConvertor->remoteArch);\
-                }                                                       \
-                to += to_extent;                                        \
-                from += from_extent;                                    \
-            }                                                           \
-        }                                                               \
-    } else if ((ptrdiff_t)sizeof(TYPE) == to_extent &&          \
-               (ptrdiff_t)sizeof(TYPE) == from_extent) {        \
-         MEMCPY( to, from, count * sizeof(TYPE) );                      \
-    } else {                                                            \
-         /* source or destination are non-contigous */                  \
-         for( i = 0; i < count; i++ ) {                                 \
-             MEMCPY( to, from, sizeof(TYPE) );                          \
-             to += to_extent;                                           \
-             from += from_extent;                                       \
-         }                                                              \
-    }                                                                   \
-    *advance = count * from_extent;                                     \
-    return count;                                                       \
+/*
+ * Defines copy_<TYPENAME>_heterogeneous(): converts elements between the vector
+ * described by elem and the packed buffer (direction chosen by mode), fixing byte
+ * order and element size, and advances both cursors and counts.  Returns 0.
+ */
+#define COPY_TYPE_HETEROGENEOUS_INTERNAL( TYPENAME, TYPE, LONG_DOUBLE )  \
+static size_t                                                            \
+copy_##TYPENAME##_heterogeneous(opal_convertor_t *pConvertor,            \
+                                int mode,                                \
+                                ddt_elem_desc_t *elem,                   \
+                                size_t* pconv_ptr,                       \
+                                size_t* pcount_desc,                     \
+                                char** ppacked_buf,                      \
+                                size_t* ppacked_len,                     \
+                                size_t packed_element_size)              \
+{                                                                        \
+    vector_iterator_state_t vec;                                         \
+    size_t vector_element_size = opal_datatype_basicDatatypes[elem->common.type]->size; \
+    vector_iter_load_current_state(&vec, elem, pconv_ptr, pcount_desc);  \
+    char **pfrom, **pto;                                                 \
+    size_t to_element_size, to_is_bigendian, from_element_size, from_is_bigendian; \
+    if (mode == COPY_TO_VECTOR) {                                        \
+        pto = &vec.buf;                                                  \
+        pfrom = ppacked_buf;                                             \
+        to_element_size = vector_element_size;                           \
+        to_is_bigendian = opal_local_arch & OPAL_ARCH_ISBIGENDIAN;       \
+        from_element_size = packed_element_size;                         \
+        from_is_bigendian = pConvertor->remoteArch & OPAL_ARCH_ISBIGENDIAN; \
+    } else {                                                             \
+        pfrom = &vec.buf;                                                \
+        pto = ppacked_buf;                                               \
+        from_element_size = vector_element_size;                         \
+        from_is_bigendian = opal_local_arch & OPAL_ARCH_ISBIGENDIAN;     \
+        to_element_size = packed_element_size;                           \
+        to_is_bigendian = pConvertor->remoteArch & OPAL_ARCH_ISBIGENDIAN; \
+    }                                                                    \
+                                                                         \
+    if (from_is_bigendian != to_is_bigendian) {                          \
+        while (*pcount_desc != 0 && *ppacked_len >= packed_element_size) { \
+            size_t mycount = vec.count;                                  \
+            if (mycount * packed_element_size > *ppacked_len) {          \
+                mycount = *ppacked_len / packed_element_size;            \
+            }                                                            \
+            opal_dt_swap_bytes(*pto, to_element_size, to_is_bigendian,   \
+                *pfrom, from_element_size, from_is_bigendian, mycount);  \
+            if (LONG_DOUBLE) {                                           \
+                opal_dt_swap_long_double(*pto, *pfrom, to_element_size, mycount, pConvertor->remoteArch); \
+            }                                                            \
+            vector_iter_consume(&vec, mycount);                          \
+            *ppacked_buf += mycount * packed_element_size;               \
+            *ppacked_len -= mycount * packed_element_size;               \
+         }                                                               \
+    } else if (to_element_size != from_element_size) {                   \
+        /* Same byte order but a different element size (for example a big */ \
+        /* endian machine converting to external32): copy element by element */ \
+        while (*pcount_desc != 0 && *ppacked_len >= packed_element_size) { \
+            size_t mycount = vec.count;                                  \
+            if (mycount * packed_element_size > *ppacked_len) {          \
+                mycount = *ppacked_len / packed_element_size;            \
+            }                                                            \
+            for (size_t i = 0; i < mycount ; ++i) {                      \
+/* Examples:                                                               */ \
+/* (shortening 8b to 4b):                                                  */ \
+/*   big to big endian       : [-- -- -- --  00 00 00 01] -> [00 00 00 01] */ \
+/*   little to little endian : [01 00 00 00  -- -- -- --] -> [01 00 00 00] */ \
+/* (lengthening 4b to 8b):                                                 */ \
+/*   big to big endian       : [00 00 00 01] -> [00 00 00 00  00 00 00 01] */ \
+/*   little to little endian : [01 00 00 00] -> [01 00 00 00  00 00 00 00] */ \
+                /* address from the block start: cursors move once, after the loop */ \
+                char *dst = *pto + i * to_element_size;                  \
+                const char *src = *pfrom + i * from_element_size;        \
+                if (from_element_size > to_element_size) {               \
+                    if (to_is_bigendian) {                               \
+                        MEMCPY(dst, src + (from_element_size-to_element_size), to_element_size); \
+                    } else {                                             \
+                        MEMCPY(dst, src, to_element_size);               \
+                    }                                                    \
+                } else {                                                 \
+                    bzero(dst, to_element_size);                         \
+                    if (to_is_bigendian) {                               \
+                        MEMCPY(dst + (to_element_size-from_element_size), src, from_element_size); \
+                    } else {                                             \
+                        MEMCPY(dst, src, from_element_size);             \
+                    }                                                    \
+                }                                                        \
+            }                                                            \
+            vector_iter_consume(&vec, mycount);                          \
+            *ppacked_buf += mycount * packed_element_size;               \
+            *ppacked_len -= mycount * packed_element_size;               \
+        }                                                                \
+    } else {                                                             \
+        while (*pcount_desc != 0 && *ppacked_len >= packed_element_size) { \
+            size_t mycount = vec.count;                                  \
+            if (mycount * packed_element_size > *ppacked_len) {          \
+                mycount = *ppacked_len / packed_element_size;            \
+            }                                                            \
+            MEMCPY(*pto, *pfrom, mycount * to_element_size );            \
+            vector_iter_consume(&vec, mycount);                          \
+            *ppacked_buf += mycount * packed_element_size;               \
+            *ppacked_len -= mycount * packed_element_size;               \
+         }                                                               \
+    }                                                                    \
+    return 0;                                                            \
 }
 
 #define COPY_2SAMETYPE_HETEROGENEOUS( TYPENAME, TYPE )                                         \
             COPY_2SAMETYPE_HETEROGENEOUS_INTERNAL( TYPENAME, TYPE, 0)
 
-#define COPY_2SAMETYPE_HETEROGENEOUS_INTERNAL( TYPENAME, TYPE, LONG_DOUBLE)                 \
-static int32_t                                                                            \
-copy_##TYPENAME##_heterogeneous(opal_convertor_t *pConvertor, size_t count,               \
-                                const char* from, size_t from_len, ptrdiff_t from_extent, \
-                                char* to, size_t to_length, ptrdiff_t to_extent,          \
-                                ptrdiff_t *advance)             \
-{                                                                       \
-    size_t i;                                                           \
-                                                                        \
-    datatype_check( #TYPE, sizeof(TYPE), sizeof(TYPE), &count,          \
-                   from, from_len, from_extent,                         \
-                   to, to_length, to_extent);                           \
-                                                                        \
-    if ((pConvertor->remoteArch & OPAL_ARCH_ISBIGENDIAN) !=             \
-        (opal_local_arch & OPAL_ARCH_ISBIGENDIAN)) {                    \
-        if( (to_extent == from_extent) && (to_extent == (2 * sizeof(TYPE))) ) { \
-            opal_dt_swap_bytes(to, from, sizeof(TYPE), 2 * count);      \
-            if (LONG_DOUBLE) {                                          \
-                opal_dt_swap_long_double(to, from, sizeof(TYPE), 2*count, pConvertor->remoteArch);\
-            }                                                           \
-        } else {                                                        \
-            for( i = 0; i < count; i++ ) {                              \
-                opal_dt_swap_bytes(to, from, sizeof(TYPE), 2);          \
-                if (LONG_DOUBLE) {                                      \
-                    opal_dt_swap_long_double(to, from, sizeof(TYPE), 2, pConvertor->remoteArch);\
-                }                                                       \
-                to += to_extent;                                        \
-                from += from_extent;                                    \
-            }                                                           \
-        }                                                               \
-    } else if ((ptrdiff_t)sizeof(TYPE) == to_extent &&          \
-               (ptrdiff_t)sizeof(TYPE) == from_extent) {        \
-         MEMCPY( to, from, count * sizeof(TYPE) );                      \
-    } else {                                                            \
-         /* source or destination are non-contigous */                  \
-         for( i = 0; i < count; i++ ) {                                 \
-             MEMCPY( to, from, sizeof(TYPE) );                          \
-             to += to_extent;                                           \
-             from += from_extent;                                       \
-         }                                                              \
-    }                                                                   \
-    *advance = count * from_extent;                                     \
-    return count;                                                       \
+/*
+ * Defines copy_<TYPENAME>_heterogeneous() for types holding two values of the
+ * same kind (e.g. complex): converts between the vector described by elem and
+ * the packed buffer, handling each element as two halves.  Returns 0.
+ */
+#define COPY_2SAMETYPE_HETEROGENEOUS_INTERNAL( TYPENAME, TYPE, LONG_DOUBLE) \
+static int32_t                                                           \
+copy_##TYPENAME##_heterogeneous(opal_convertor_t *pConvertor,            \
+                                int mode,                                \
+                                ddt_elem_desc_t *elem,                   \
+                                size_t* pconv_ptr,                       \
+                                size_t* pcount_desc,                     \
+                                char** ppacked_buf,                      \
+                                size_t* ppacked_len,                     \
+                                size_t packed_element_size)              \
+{                                                                        \
+    vector_iterator_state_t vec;                                         \
+    size_t vector_element_size = opal_datatype_basicDatatypes[elem->common.type]->size; \
+    vector_iter_load_current_state(&vec, elem, pconv_ptr, pcount_desc);  \
+    char **pfrom, **pto;                                                 \
+    size_t to_element_size, to_is_bigendian, from_element_size, from_is_bigendian; \
+    if (mode == COPY_TO_VECTOR) {                                        \
+        pto = &vec.buf;                                                  \
+        pfrom = ppacked_buf;                                             \
+        to_element_size = vector_element_size;                           \
+        to_is_bigendian = opal_local_arch & OPAL_ARCH_ISBIGENDIAN;       \
+        from_element_size = packed_element_size;                         \
+        from_is_bigendian = pConvertor->remoteArch & OPAL_ARCH_ISBIGENDIAN; \
+    } else {                                                             \
+        pfrom = &vec.buf;                                                \
+        pto = ppacked_buf;                                               \
+        from_element_size = vector_element_size;                         \
+        from_is_bigendian = opal_local_arch & OPAL_ARCH_ISBIGENDIAN;     \
+        to_element_size = packed_element_size;                           \
+        to_is_bigendian = pConvertor->remoteArch & OPAL_ARCH_ISBIGENDIAN; \
+    }                                                                    \
+                                                                         \
+    if (from_is_bigendian != to_is_bigendian) {                          \
+        while (*pcount_desc != 0 && *ppacked_len >= packed_element_size) { \
+            size_t mycount = vec.count;                                  \
+            if (mycount * packed_element_size > *ppacked_len) {          \
+                mycount = *ppacked_len / packed_element_size;            \
+            }                                                            \
+/* an incoming MPI_DOUBLE_COMPLEX would have count=1 and *_element_size=16 */ \
+/* but we want to handle it as 2 elements of size 8 */                   \
+            opal_dt_swap_bytes(*pto, to_element_size/2, to_is_bigendian, \
+                *pfrom, from_element_size/2, from_is_bigendian, mycount*2); \
+            if (LONG_DOUBLE) {                                           \
+                opal_dt_swap_long_double(*pto, *pfrom, to_element_size/2, mycount*2, pConvertor->remoteArch); \
+            }                                                            \
+            vector_iter_consume(&vec, mycount);                          \
+            *ppacked_buf += mycount * packed_element_size;               \
+            *ppacked_len -= mycount * packed_element_size;               \
+         }                                                               \
+    } else if (to_element_size != from_element_size) {                   \
+        /* Same byte order but a different element size (for example a big */ \
+        /* endian machine converting to external32): copy half by half */ \
+        while (*pcount_desc != 0 && *ppacked_len >= packed_element_size) { \
+            size_t mycount = vec.count;                                  \
+            if (mycount * packed_element_size > *ppacked_len) {          \
+                mycount = *ppacked_len / packed_element_size;            \
+            }                                                            \
+            for (size_t i = 0; i < mycount*2 ; ++i) {                    \
+/* Examples:                                                               */ \
+/* (shortening 8b to 4b):                                                  */ \
+/*   big to big endian       : [-- -- -- --  00 00 00 01] -> [00 00 00 01] */ \
+/*   little to little endian : [01 00 00 00  -- -- -- --] -> [01 00 00 00] */ \
+/* (lengthening 4b to 8b):                                                 */ \
+/*   big to big endian       : [00 00 00 01] -> [00 00 00 00  00 00 00 01] */ \
+/*   little to little endian : [01 00 00 00] -> [01 00 00 00  00 00 00 00] */ \
+                /* address from the block start: cursors move once, after the loop */ \
+                char *dst = *pto + i * (to_element_size/2);              \
+                const char *src = *pfrom + i * (from_element_size/2);    \
+                if (from_element_size > to_element_size) {               \
+                    if (to_is_bigendian) {                               \
+                        MEMCPY(dst, src + (from_element_size/2-to_element_size/2), to_element_size/2); \
+                    } else {                                             \
+                        MEMCPY(dst, src, to_element_size/2);             \
+                    }                                                    \
+                } else {                                                 \
+                    bzero(dst, to_element_size/2);                       \
+                    if (to_is_bigendian) {                               \
+                        MEMCPY(dst + (to_element_size/2-from_element_size/2), src, from_element_size/2); \
+                    } else {                                             \
+                        MEMCPY(dst, src, from_element_size/2);           \
+                    }                                                    \
+                }                                                        \
+            }                                                            \
+            vector_iter_consume(&vec, mycount);                          \
+            *ppacked_buf += mycount * packed_element_size;               \
+            *ppacked_len -= mycount * packed_element_size;               \
+        }                                                                \
+    } else {                                                             \
+        while (*pcount_desc != 0 && *ppacked_len >= packed_element_size) { \
+            size_t mycount = vec.count;                                  \
+            if (mycount * packed_element_size > *ppacked_len) {          \
+                mycount = *ppacked_len / packed_element_size;            \
+            }                                                            \
+            MEMCPY(*pto, *pfrom, mycount * to_element_size );            \
+            vector_iter_consume(&vec, mycount);                          \
+            *ppacked_buf += mycount * packed_element_size;               \
+            *ppacked_len -= mycount * packed_element_size;               \
+         }                                                               \
+    }                                                                    \
+    return 0;                                                            \
 }
 
 #define COPY_2TYPE_HETEROGENEOUS( TYPENAME, TYPE1, TYPE2 )              \
diff --git a/opal/datatype/opal_datatype_internal.h b/opal/datatype/opal_datatype_internal.h
index e60c586f0b1..97de8cde73d 100644
--- a/opal/datatype/opal_datatype_internal.h
+++ b/opal/datatype/opal_datatype_internal.h
@@ -29,6 +29,7 @@
 #define OPAL_DATATYPE_INTERNAL_H_HAS_BEEN_INCLUDED
 
 #include "opal_config.h"
+#include "opal_datatype.h"
 
 #include <stdarg.h>
 #include <string.h>
@@ -616,5 +617,108 @@ extern bool opal_ddt_unpack_debug;
 extern bool opal_ddt_pack_debug;
 extern bool opal_ddt_raw_debug;
 
+/*
+ * Vector iterating utilities to be used with conv_ptr and count_desc
+ *
+ * conv_ptr is managed the same way it appears to be done elsewhere:
+ * it is incremented as we walk the vector, but it is not offset by
+ * elem.disp.  So if a partially walked pElem were pushed onto the
+ * stack and restored later, the conv_ptr would come back as being
+ * iterated somewhere in the middle of the vector, but it would still
+ * need to be offset by elem.disp to locate actual data.
+ *
+ * The .buf field though is updated to be the actual location of the
+ * data, i.e. conv_ptr + elem.disp
+ */
+struct vector_iterator_state {
+    char *buf; /* next element to process: *pconv_ptr + elem->disp */
+    size_t count; /* elements left in the contiguous block that buf is in */
+    /*
+     * The standard bookkeeping values live outside the iterator,
+     * so it only holds pointers to them
+     *   conv_ptr : position in the vector, without elem->disp applied
+     *       (incremented as we iterate)
+     *   count_desc : total elements left in vector to iterate over
+     *       (decremented as we iterate)
+     */
+    unsigned char **pconv_ptr;
+    size_t *pcount_desc;
+
+    /* descriptor of the vector being walked */
+    ddt_elem_desc_t* elem;
+
+    /* elements of the current block that have already been consumed */
+    size_t index_within_block;
+};
+typedef struct vector_iterator_state vector_iterator_state_t;
+
+#define COPY_TO_VECTOR   1  /* packed buffer -> vector (mode used by the unpack path) */
+#define COPY_FROM_VECTOR 2  /* vector -> packed buffer (mode used by the pack path) */
+
+/*
+ * Attach an iterator to a vector that may already be partly walked.
+ *   it          : iterator to initialize
+ *   elem        : descriptor of the vector (count blocks of blocklen elements)
+ *   pconv_ptr   : cursor into the vector, not yet offset by elem->disp
+ *   pcount_desc : elements still left to process in the vector
+ * On return it->buf points at the next element and it->count holds how many
+ * elements remain in the contiguous block that contains it.
+ */
+static inline void
+vector_iter_load_current_state(
+    vector_iterator_state_t *it,
+    ddt_elem_desc_t* elem,
+    unsigned char **pconv_ptr,
+    size_t *pcount_desc)
+{
+    const size_t nblocks = elem->count;
+    const size_t blocklen = elem->blocklen;
+    /* elements consumed so far; 0 when the vector has not been started */
+    const size_t elements_done = nblocks * blocklen - *pcount_desc;
+
+    it->elem = elem;
+    it->pconv_ptr = pconv_ptr;
+    it->pcount_desc = pcount_desc;
+
+    /* position inside the current block; no modulo for an untouched vector */
+    it->index_within_block = (0 != elements_done) ? elements_done % blocklen : 0;
+
+    /* elements left in the current contiguous block */
+    it->count = blocklen - it->index_within_block;
+    /* conv_ptr excludes elem->disp, so add it to reach the actual data */
+    it->buf = (char *) (*pconv_ptr + elem->disp);
+}
+
+/*
+ * Advance the iterator by nelements (at most *it->pcount_desc): moves
+ * *it->pconv_ptr, decrements *it->pcount_desc, and refreshes it->buf and
+ * it->count for the block that the cursor lands in.
+ */
+static inline void
+vector_iter_consume(
+    vector_iterator_state_t *it,
+    size_t nelements)
+{
+    const size_t blocklen = it->elem->blocklen;
+    const size_t extent = it->elem->extent;
+    const size_t element_size = opal_datatype_basicDatatypes[it->elem->common.type]->size;
+
+    assert(nelements <= *it->pcount_desc);
+
+    /* rewind to the block start: whole blocks step by extent, the rest by element size */
+    *it->pconv_ptr -= it->index_within_block * element_size;
+    *it->pcount_desc += it->index_within_block;
+    nelements += it->index_within_block;
+
+    const size_t blocks_done = nelements / blocklen;
+    const size_t remainder_done = nelements - blocks_done * blocklen;
+    *it->pconv_ptr += blocks_done * extent + remainder_done * element_size;
+    *it->pcount_desc -= nelements;
+    it->index_within_block = remainder_done;
+
+    it->count = blocklen - it->index_within_block;
+    it->buf = (char *) (*it->pconv_ptr + it->elem->disp);
+}
+
 END_C_DECLS
 #endif  /* OPAL_DATATYPE_INTERNAL_H_HAS_BEEN_INCLUDED */
diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c
index b5225017a59..1b25b379dcd 100644
--- a/opal/datatype/opal_datatype_pack.c
+++ b/opal/datatype/opal_datatype_pack.c
@@ -14,6 +14,7 @@
  * Copyright (c) 2013      Cisco Systems, Inc.  All rights reserved.
  * Copyright (c) 2017-2018 Research Organization for Information Science
  *                         and Technology (RIST).  All rights reserved.
+ * Copyright (c) 2020      IBM Corporation.  All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -36,6 +37,7 @@
 #define DO_DEBUG(INST)
 #endif  /* OPAL_ENABLE_DEBUG */
 
+#include "opal/util/arch.h"
 #include "opal/datatype/opal_datatype_checksum.h"
 #include "opal/datatype/opal_datatype_pack.h"
 #include "opal/datatype/opal_datatype_prototypes.h"
@@ -392,12 +394,15 @@ pack_predefined_heterogeneous( opal_convertor_t* CONVERTOR,
 {
     const opal_convertor_master_t* master = (CONVERTOR)->master;
     const ddt_elem_desc_t* _elem = &((ELEM)->elem);
-    unsigned char* _source = (*SOURCE) + _elem->disp;
+    unsigned char* _source = (*SOURCE); /* don't offset to _elem->disp here, let pFunction do that */
     ptrdiff_t advance;
     size_t _count = *(COUNT);
     size_t _r_blength;
 
     _r_blength = master->remote_sizes[_elem->common.type];
+    if (_elem->common.ompi_id != OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY && master->ompi_remote_sizes_is_set) {
+        _r_blength = master->ompi_remote_sizes[_elem->common.ompi_id];
+    }
     if( (_count * _r_blength) > *(SPACE) ) {
         _count = (*(SPACE) / _r_blength);
         if( 0 == _count ) return;  /* nothing to do */
@@ -410,15 +415,16 @@ pack_predefined_heterogeneous( opal_convertor_t* CONVERTOR,
                            ((ptrdiff_t)_r_blength == _elem->extent) ? "cont" : "----",
                            (void*)*(DESTINATION), (void*)_source, (unsigned long)_r_blength,
                            (unsigned long)(*(SPACE)) ); );
-    master->pFunctions[_elem->common.type]( CONVERTOR, _count,
-                                            _source, *SPACE, _elem->extent,
-                                            *DESTINATION, *SPACE, _r_blength,
-                                            &advance );
-    _r_blength     *= _count;  /* update the remote length to encompass all the elements */
-    *(SOURCE)      += _count * _elem->extent;
-    *(DESTINATION) += _r_blength;
-    *(SPACE)       -= _r_blength;
-    *(COUNT)       -= _count;
+    size_t elements_done = 0;
+    size_t rc;
+    rc = master->pFunctions[_elem->common.type]( CONVERTOR,
+                                            COPY_FROM_VECTOR,
+                                            _elem,
+                                            SOURCE,
+                                            COUNT,
+                                            DESTINATION,
+                                            SPACE,
+                                            _r_blength);
 }
 
 int32_t
@@ -550,7 +556,7 @@ opal_pack_general_function( opal_convertor_t* pConvertor,
     *max_data = total_packed;
     pConvertor->bConverted += total_packed;  /* update the already converted bytes */
     *out_size = iov_count;
-    if( pConvertor->bConverted == pConvertor->local_size ) {
+    if( pConvertor->bConverted == pConvertor->remote_size ) {
         pConvertor->flags |= CONVERTOR_COMPLETED;
         return 1;
     }
diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c
index 6f9fdce2774..b49fb0cc3f9 100644
--- a/opal/datatype/opal_datatype_unpack.c
+++ b/opal/datatype/opal_datatype_unpack.c
@@ -15,6 +15,7 @@
  * Copyright (c) 2013      Cisco Systems, Inc.  All rights reserved.
  * Copyright (c) 2017-2018 Research Organization for Information Science
  *                         and Technology (RIST).  All rights reserved.
+ * Copyright (c) 2020      IBM Corporation.  All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -38,6 +39,7 @@
 #define DO_DEBUG(INST)
 #endif  /* OPAL_ENABLE_DEBUG */
 
+#include "opal/util/arch.h"
 #include "opal/datatype/opal_datatype_checksum.h"
 #include "opal/datatype/opal_datatype_unpack.h"
 #include "opal/datatype/opal_datatype_prototypes.h"
@@ -484,14 +486,24 @@ opal_unpack_general_function( opal_convertor_t* pConvertor,
                                        (void*)pConvertor->pBaseBuf, conv_ptr + pElem->elem.disp - pConvertor->pBaseBuf,
                                        count_desc, description[pos_desc].elem.extent,
                                        opal_datatype_basicDatatypes[type]->name ); );
-                rc = master->pFunctions[type]( pConvertor, count_desc,
-                                               iov_ptr, iov_len_local, opal_datatype_basicDatatypes[type]->size,
-                                               conv_ptr + pElem->elem.disp,
-                                               (pConvertor->pDesc->ub - pConvertor->pDesc->lb) * pConvertor->count,
-                                               description[pos_desc].elem.extent, &advance );
-                iov_len_local -= advance;  /* decrease the available space in the buffer */
-                iov_ptr += advance;        /* increase the pointer to the buffer */
-                count_desc -= rc;          /* compute leftovers */
+                size_t _r_blength = master->remote_sizes[type];
+                if (pElem->elem.common.ompi_id != OPAL_MIRROR_OMPI_DATATYPE_MPI_EMPTY && master->ompi_remote_sizes_is_set) {
+                    _r_blength = master->ompi_remote_sizes[pElem->elem.common.ompi_id];
+                }
+/*
+ *  pElem describes a vector being iterated over, with conv_ptr stepping
+ *  through, and count_desc decrementing for what's already completed.
+ */
+                rc = master->pFunctions[type]( pConvertor,
+                                               COPY_TO_VECTOR,
+                                               &pElem->elem,
+                                               &conv_ptr,
+                                               &count_desc,
+                                               &iov_ptr,
+                                               &iov_len_local,
+                                               _r_blength);
+                /* pFunction already advanced iov_ptr/conv_ptr and
+                 * decremented iov_len_local/count_desc */
                 if( 0 == count_desc ) {  /* completed */
                     conv_ptr = pConvertor->pBaseBuf + pStack->disp;
                     pos_desc++;  /* advance to the next data */
@@ -499,7 +511,6 @@ opal_unpack_general_function( opal_convertor_t* pConvertor,
                     if( 0 == iov_len_local ) goto complete_loop;  /* escape if we're done */
                     continue;
                 }
-                conv_ptr += rc * description[pos_desc].elem.extent;
                 assert( pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED );
                 assert( 0 == iov_len_local );
                 if( 0 != iov_len_local ) {
diff --git a/opal/util/arch.h b/opal/util/arch.h
index a1a0446e1f3..d1c61d575d8 100644
--- a/opal/util/arch.h
+++ b/opal/util/arch.h
@@ -11,6 +11,7 @@
  * Copyright (c) 2004-2006 The Regents of the University of California.
  *                         All rights reserved.
  * Copyright (c) 2008      Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2020      IBM Corporation.  All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -202,7 +203,7 @@
 
 /* BYTE 2 */
 #define OPAL_ARCH_LONGISxx        0x0000c000  /* mask for sizeof long */
-#define OPAL_ARCH_LONGIS64        0x00001000
+#define OPAL_ARCH_LONGIS64        0x00008000
 #define OPAL_ARCH_LONGLONGISxx    0x00003000  /* mask for sizeof long long */
 
 #define OPAL_ARCH_BOOLISxx        0x00000c00  /* mask for sizeof bool */

From f83d47f9fa38a4c6f3af2209b50b6ad5d9578ed6 Mon Sep 17 00:00:00 2001
From: Mark Allen <markalle@us.ibm.com>
Date: Wed, 14 Oct 2020 01:29:25 -0400
Subject: [PATCH 3/3] updating ompi_op_ddt_map[] for more OMPI types

The OMPI types used to have more items #defined to the same values,
so OMPI_DATATYPE_MPI_INT was one of OMPI_DATATYPE_MPI_INT16_T or
OMPI_DATATYPE_MPI_INT32_T etc.  The reduction code uses a
map (ompi_op_ddt_map[]) that's expected to have a value for each
OMPI_DATATYPE_*, so it needed more entries.

Signed-off-by: Mark Allen <markalle@us.ibm.com>
---
 ompi/op/op.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/ompi/op/op.c b/ompi/op/op.c
index 39732a40187..4fdacd6f564 100644
--- a/ompi/op/op.c
+++ b/ompi/op/op.c
@@ -17,6 +17,7 @@
  * Copyright (c) 2015      Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
  * Copyright (c) 2018      FUJITSU LIMITED.  All rights reserved.
+ * Copyright (c) 2020      IBM Corporation.  All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -228,6 +229,72 @@ int ompi_op_init(void)
     ompi_op_ddt_map[OMPI_DATATYPE_MPI_SHORT_FLOAT] = OMPI_OP_BASE_TYPE_SHORT_FLOAT;
     ompi_op_ddt_map[OMPI_DATATYPE_MPI_C_SHORT_FLOAT_COMPLEX] = OMPI_OP_BASE_TYPE_C_SHORT_FLOAT_COMPLEX;
 
+/*
+ *  The list of OMPI types used to end here.  And the other entries were
+ *  mapped to one of the above, eg
+ *    OMPI_DATATYPE_MPI_INT was #defined as one of
+ *    OMPI_DATATYPE_MPI_INT16_T or OMPI_DATATYPE_MPI_INT32_T etc
+ *    depending on the size of an int.
+ *  The external32 fix involves more separation of OMPI datatypes.
+ *
+ *  We could have a lot of sizeof checks like
+ *    #if SIZEOF_INT == 4
+ *    ompi_op_ddt_map[OMPI_DATATYPE_MPI_INT] = ompi_op_ddt_map[OMPI_DATATYPE_MPI_INT32_T]
+ *    ...
+ *  but that data is indirectly available already in the .id field.  Both
+ *  of those OMPI types have the same OPAL value (6 in this example) for .id.
+ */
+#define FIND_MATCH(OMPIDT) do {  /* inherit op mapping from a type sharing .id */     \
+        int opal_id_search;  /* OPAL .id of OMPIDT */                                 \
+        int ompi_id_i;       /* OMPI type index being scanned */                      \
+        int opal_id_i;       /* OPAL .id of the scanned type */                       \
+        opal_id_search = ompi_datatype_basicDatatypes[OMPIDT]->super.id;              \
+        for (ompi_id_i = 0; ompi_id_i < OMPI_DATATYPE_MAX_PREDEFINED; ++ompi_id_i) {  \
+            opal_id_i = ompi_datatype_basicDatatypes[ompi_id_i]->super.id;            \
+            if (opal_id_i == opal_id_search && ompi_op_ddt_map[ompi_id_i] != -1) {    \
+                ompi_op_ddt_map[OMPIDT] = ompi_op_ddt_map[ompi_id_i];                 \
+                break;  /* first mapped match wins */                                 \
+            }                                                                         \
+        }                                                                             \
+        if (ompi_op_ddt_map[OMPIDT] == -1) {  /* no mapped type shares the .id */     \
+            if (!(ompi_datatype_basicDatatypes[OMPIDT]->super.flags &                 \
+                OPAL_DATATYPE_FLAG_UNAVAILABLE))                                      \
+            {                                                                         \
+                return OMPI_ERROR;  /* not flagged UNAVAILABLE: fail the caller */    \
+            }                                                                         \
+        }                                                                             \
+    } while (0)
+    FIND_MATCH(OMPI_DATATYPE_MPI_CHAR);
+    FIND_MATCH(OMPI_DATATYPE_MPI_SIGNED_CHAR);
+    FIND_MATCH(OMPI_DATATYPE_MPI_UNSIGNED_CHAR);
+    FIND_MATCH(OMPI_DATATYPE_MPI_BYTE);
+    FIND_MATCH(OMPI_DATATYPE_MPI_SHORT);
+    FIND_MATCH(OMPI_DATATYPE_MPI_UNSIGNED_SHORT);
+    FIND_MATCH(OMPI_DATATYPE_MPI_INT);
+    FIND_MATCH(OMPI_DATATYPE_MPI_UNSIGNED);
+    FIND_MATCH(OMPI_DATATYPE_MPI_LONG);
+    FIND_MATCH(OMPI_DATATYPE_MPI_UNSIGNED_LONG);
+    FIND_MATCH(OMPI_DATATYPE_MPI_LONG_LONG_INT);
+    FIND_MATCH(OMPI_DATATYPE_MPI_UNSIGNED_LONG_LONG);
+    FIND_MATCH(OMPI_DATATYPE_MPI_LOGICAL1);
+    FIND_MATCH(OMPI_DATATYPE_MPI_LOGICAL2);
+    FIND_MATCH(OMPI_DATATYPE_MPI_LOGICAL4);
+    FIND_MATCH(OMPI_DATATYPE_MPI_LOGICAL8);
+    FIND_MATCH(OMPI_DATATYPE_MPI_INTEGER1);
+    FIND_MATCH(OMPI_DATATYPE_MPI_INTEGER2);
+    FIND_MATCH(OMPI_DATATYPE_MPI_INTEGER4);
+    FIND_MATCH(OMPI_DATATYPE_MPI_INTEGER8);
+    FIND_MATCH(OMPI_DATATYPE_MPI_INTEGER16);
+    FIND_MATCH(OMPI_DATATYPE_MPI_REAL2);
+    FIND_MATCH(OMPI_DATATYPE_MPI_REAL4);
+    FIND_MATCH(OMPI_DATATYPE_MPI_REAL8);
+    FIND_MATCH(OMPI_DATATYPE_MPI_REAL16);
+    FIND_MATCH(OMPI_DATATYPE_MPI_CXX_BOOL);
+    FIND_MATCH(OMPI_DATATYPE_MPI_CXX_SHORT_FLOAT_COMPLEX);
+    FIND_MATCH(OMPI_DATATYPE_MPI_CXX_FLOAT_COMPLEX);
+    FIND_MATCH(OMPI_DATATYPE_MPI_CXX_DOUBLE_COMPLEX);
+    FIND_MATCH(OMPI_DATATYPE_MPI_CXX_LONG_DOUBLE_COMPLEX);
+
     /* Create the intrinsic ops */
 
     if (OMPI_SUCCESS !=