open-mpi
diff --git a/‎ompi/datatype/ompi_datatype_external32.c
Lines changed: 3 additions & 1 deletion b/‎ompi/datatype/ompi_datatype_external32.c
Lines changed: 3 additions & 1 deletion
diff --git a/‎ompi/datatype/ompi_datatype_internal.h
Lines changed: 3 additions & 10 deletions b/‎ompi/datatype/ompi_datatype_internal.h
Lines changed: 3 additions & 10 deletions
diff --git a/‎ompi/mca/coll/hcoll/coll_hcoll_dtypes.h
Lines changed: 13 additions & 1 deletion b/‎ompi/mca/coll/hcoll/coll_hcoll_dtypes.h
Lines changed: 13 additions & 1 deletion
diff --git a/‎opal/datatype/opal_convertor.c
Lines changed: 24 additions & 1 deletion b/‎opal/datatype/opal_convertor.c
Lines changed: 24 additions & 1 deletion
diff --git a/‎opal/datatype/opal_convertor_internal.h
Lines changed: 233 additions & 4 deletions b/‎opal/datatype/opal_convertor_internal.h
Lines changed: 233 additions & 4 deletions
@@ -11,6 +11,7 @@
  * Copyright (c) 2004-2006 The Regents of the University of California.
  *                         All rights reserved.
  * Copyright (c) 2009      Oak Ridge National Labs.  All rights reserved.
+ * Copyright (c) 2020      IBM Corporation.  All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -71,7 +72,8 @@
 uint32_t ompi_datatype_external32_arch_id = OPAL_ARCH_LDEXPSIZEIS15 | OPAL_ARCH_LDMANTDIGIS113 |
                                             OPAL_ARCH_LONGDOUBLEIS128 | OPAL_ARCH_ISBIGENDIAN |
                                             OPAL_ARCH_HEADERMASK | OPAL_ARCH_HEADERMASK2 |
-                                            OPAL_ARCH_BOOLIS8 | OPAL_ARCH_LOGICALIS8;
+                                            OPAL_ARCH_BOOLIS8 | OPAL_ARCH_LOGICALIS8 |
+                                            OPAL_ARCH_LONGIS32;
 
 opal_convertor_t* ompi_mpi_external32_convertor = NULL;
 opal_convertor_t* ompi_mpi_local_convertor = NULL;
 
@@ -10,6 +10,7 @@
  * Copyright (c) 2015-2018 Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
  * Copyright (c) 2016-2018 FUJITSU LIMITED.  All rights reserved.
+ * Copyright (c) 2020      IBM Corporation.  All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -570,16 +571,8 @@ extern const ompi_datatype_t* ompi_datatype_basicDatatypes[OMPI_DATATYPE_MPI_MAX
 #define OMPI_DATATYPE_INITIALIZER_UNSIGNED            OPAL_DATATYPE_INITIALIZER_UINT8
 #endif
 
-#if SIZEOF_LONG == 4
-#define OMPI_DATATYPE_INITIALIZER_LONG                OPAL_DATATYPE_INITIALIZER_INT4
-#define OMPI_DATATYPE_INITIALIZER_UNSIGNED_LONG       OPAL_DATATYPE_INITIALIZER_UINT4
-#elif SIZEOF_LONG == 8
-#define OMPI_DATATYPE_INITIALIZER_LONG                OPAL_DATATYPE_INITIALIZER_INT8
-#define OMPI_DATATYPE_INITIALIZER_UNSIGNED_LONG       OPAL_DATATYPE_INITIALIZER_UINT8
-#elif SIZEOF_LONG == 16
-#define OMPI_DATATYPE_INITIALIZER_LONG                OPAL_DATATYPE_INITIALIZER_INT16
-#define OMPI_DATATYPE_INITIALIZER_UNSIGNED_LONG       OPAL_DATATYPE_INITIALIZER_UINT16
-#endif
+#define OMPI_DATATYPE_INITIALIZER_LONG                OPAL_DATATYPE_INITIALIZER_LONG
+#define OMPI_DATATYPE_INITIALIZER_UNSIGNED_LONG       OPAL_DATATYPE_INITIALIZER_ULONG
 
 #if SIZEOF_LONG_LONG == 4
 #define OMPI_DATATYPE_INITIALIZER_LONG_LONG_INT       OPAL_DATATYPE_INITIALIZER_INT4
 
@@ -92,7 +92,19 @@ static dte_data_representation_t* ompi_datatype_2_dte_data_rep[OMPI_DATATYPE_MAX
 #else
     &DTE_ZERO,
 #endif
-    &DTE_ZERO                   /*OPAL_DATATYPE_UNAVAILABLE    25 */
+
+#if SIZEOF_LONG == 4
+    &DTE_INT32,                 /*OPAL_DATATYPE_LONG           25 */
+    &DTE_UINT32,                /*OPAL_DATATYPE_ULONG          26 */
+#elif SIZEOF_LONG == 8
+    &DTE_INT64,                 /*OPAL_DATATYPE_LONG           25 */
+    &DTE_UINT64,                /*OPAL_DATATYPE_ULONG          26 */
+#elif SIZEOF_LONG == 16
+    &DTE_INT128,                /*OPAL_DATATYPE_LONG           25 */
+    &DTE_UINT128,               /*OPAL_DATATYPE_ULONG          26 */
+#endif
+
+    &DTE_ZERO                   /*OPAL_DATATYPE_UNAVAILABLE    27 */
 };
 
 enum {
 
@@ -15,6 +15,7 @@
  * Copyright (c) 2013-2018 Research Organization for Information Science
  *                         and Technology (RIST).  All rights reserved.
  * Copyright (c) 2017      Intel, Inc. All rights reserved
+ * Copyright (c) 2020      IBM Corporation.  All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -141,6 +142,12 @@ opal_convertor_master_t* opal_convertor_find_or_create_master( uint32_t remote_a
         opal_output( 0, "Unknown sizeof(bool) for the remote architecture\n" );
     }
 
+    /* Same for long */
+    if( opal_arch_checkmask( &master->remote_arch, OPAL_ARCH_LONGIS32 ) ) {
+        remote_sizes[OPAL_DATATYPE_LONG] = 4;
+        remote_sizes[OPAL_DATATYPE_ULONG] = 4;
+    }
+
     /**
      * Now we can compute the conversion mask. For all sizes where the remote
      * and local architecture differ a conversion is needed. Moreover, if the
@@ -482,8 +489,24 @@ size_t opal_convertor_compute_remote_size( opal_convertor_t* pConvertor )
 
     pConvertor->remote_size = pConvertor->local_size;
     if( OPAL_UNLIKELY(datatype->bdt_used & pConvertor->master->hetero_mask) ) {
+        int is_send_conversion = 0;
+        if (pConvertor->flags & CONVERTOR_SEND_CONVERSION) {
+            // Adding to the conditions for keeping the optimized description.
+            // Now it's only optimized if (send && contiguous &&
+            // !something like external32 that needs conversion)
+            //
+            // Note, elsewhere there are similar checks that boil down to
+            // checking that CONVERTOR_SEND_CONVERSION is on but that
+            // HOMOGENEOUS is off.  That kind of makes sense, except
+            // OPAL_CONVERTOR_PREPARE seems to universally set HOMOGENEOUS
+            // so I don't think that setting means what it looks like it
+            // means, so I'm not using it.
+            is_send_conversion = 1;
+        }
         pConvertor->flags &= (~CONVERTOR_HOMOGENEOUS);
-        if (!(pConvertor->flags & CONVERTOR_SEND && pConvertor->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS)) {
+        if (!(pConvertor->flags & CONVERTOR_SEND && pConvertor->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS
+            && !is_send_conversion))
+        {
             pConvertor->use_desc = &(datatype->desc);
         }
         if( 0 == (pConvertor->flags & CONVERTOR_HAS_REMOTE_SIZE) ) {
 
@@ -7,6 +7,7 @@
  * Copyright (c) 2013      Cisco Systems, Inc.  All rights reserved.
  * Copyright (c) 2017      Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
+ * Copyright (c) 2020      IBM Corporation.  All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -19,13 +20,241 @@
 #include "opal_config.h"
 
 #include "opal/datatype/opal_convertor.h"
+#include "opal/datatype/opal_datatype_internal.h"
 
 BEGIN_C_DECLS
 
-typedef int32_t (*conversion_fct_t)( opal_convertor_t* pConvertor, uint32_t count,
-                                     const void* from, size_t from_len, ptrdiff_t from_extent,
-                                     void* to, size_t to_length, ptrdiff_t to_extent,
-                                     ptrdiff_t *advance );
+/*
+ * Values for the `mode` argument of conversion_fct_t.
+ * NOTE(review): presumably these select the copy direction between the
+ * packed buffer and the vector-shaped buffer -- confirm against the
+ * conversion function implementations.
+ */
+#define COPY_TO_VECTOR   1
+#define COPY_FROM_VECTOR 2
+/*
+ * Signature shared by the per-datatype conversion functions.
+ *
+ * The function operates on two buffers: `vector_buf`, whose layout is
+ * described by `vector_info`, and `packed_buf`, described by `packed_len`
+ * and `packed_element_size`.
+ * NOTE(review): `packed_len` looks like a byte length and
+ * `total_elements_done` like an in/out running element count used to
+ * resume a partially processed vector -- confirm against the callers.
+ */
+// returns elements processed in the packed_buf for this call
+typedef size_t (*conversion_fct_t)( opal_convertor_t* pConvertor, int mode,
+                                     char* vector_buf,
+                                     const ddt_elem_desc_t *vector_info,
+                                     char* packed_buf,
+                                     size_t packed_len,
+                                     size_t packed_element_size,
+                                     size_t *total_elements_done );
+
+/*
+ *  The VECTOR_* macros are made to help iterate a buffer pointer over
+ *  data that resembles an MPI_Type_vector, ie it has a number of
+ *  blocks, a count_per_block, an extent_between_blocks, and an element
+ *  size.
+ *
+ *  For example if a buffer's data were
+ *      [INT4,INT4,----,INT4,INT4,----,INT4,INT4]
+ *  the vars describing the vector would be
+ *      nblocks = 3
+ *      count_per_block = 2
+ *      extent_between_blocks = 12
+ *      element_size = 4
+ *
+ *  The macros also have inputs for stopping part of the way through
+ *  a vector, and for picking back up in the middle of the vector.
+ *  The granularity for that is an "element" though, so you could
+ *  stop after writing 3 INT4 in the above example, then pick
+ *  back up at the next INT4, but you can't stop in the middle of
+ *  an INT4.
+ *
+ *  Interface description
+ *      VECTOR_INIT():
+ *          arguments:
+ *          buf                   : starting point of a buffer whose
+ *                                : data is a vector, like
+ *                                : [INT4,INT4,----,INT4,INT4,----,INT4,INT4]
+ *          nblocks               : vector description for number of
+ *                                : contiguous chunks, in this example 3
+ *          count_per_block       : vector description for number of elements
+ *                                : per chunk, in this example 2
+ *          extent_between_blocks : vector description for bytes between
+ *                                : blocks, in this example 12
+ *          element_size          : vector description for element size,
+ *                                : in this example 4 for size of the INT4
+ *          elements_already_done : allow iteration to pick up in the middle
+ *          max_elements          : specify an external factor limiting the
+ *                                : number of vector elements to iterate over
+ *                                : (beyond the current elements_already_done).
+ *                                : The expected use case for this is that
+ *                                : we're iterating over two buffers: one
+ *                                : a vector, the other a packed buf, and
+ *                                : we're probably copying data from one to
+ *                                : the other.  So "max_elements" could be
+ *                                : given as the number of elements the
+ *                                : packed buffer has, so that constraint
+ *                                : will then be included in deciding how
+ *                                : many elements are to be iterated over
+ *                                : in the vector.  If no external factor like
+ *                                : that is relevant, then using max_elements
+ *                                : = (nblocks * count_per_block) would make
+ *                                : this argument a no-op.  Note that this
+ *                                : arg is relative to elements_already_done,
+ *                                : so if you've already done 3, and you have
+ *                                : room to iterate 3 more elements, you'd
+ *                                : specify max_elements=3
+ *      VECTOR_GET_NEXT_CONTIG_BLOCK():
+ *          updates vec.buf and vec.len
+ *      VECTOR_GET_NEXT_CONTIG_BLOCK_LIMITED_TO(max):
+ *          updates vec.buf and vec.len but only gives a vec.len
+ *          of at most max
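+ *      VECTOR_UPDATE():
+ *          advances vec.buf and vec.elements_done past the vec.len
+ *          elements of the contiguous block that was just processed
+ *      vector_iteration_state_t vec public fields: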
+ *          char *buf            : where to write next data
+ *          size_t len           : length of next contiguous block of elements
+ *          size_t elements_done : num elements (total) iterated over
+ *                               : (so if you initialize with done=3 then
+ *                               : iterate over 2 then iterate over 3 more,
+ *                               : done=8)
+ *          size_t max_elements  : num elements till done iterating the vector
+ *
+ *  Documentation by example:
+ *
+ *      vector_iteration_state_t vec;
+ *      // [INT4,INT4,....,INT4,INT4,....,INT4,INT4]
+ *      int buf[] = {1, 2, -1, 3, 4, -1, 5, 6};
+ *      int i;
+ *      VECTOR_INIT(buf,
+ *          3, 2, 12, 4, // vector description
+ *          0, 6);       // start at the beginning, no external count limitation
+ *      printf("num elements in vector: %zu\n", vec.max_elements);
+ *      while (vec.elements_done < vec.max_elements) {
+ *          VECTOR_GET_NEXT_CONTIG_BLOCK();
+ *          printf("(buf offset %d) contig count %zu\n",
+ *              (int)(vec.buf - (char*)buf), vec.len);
+ *          for (i=0; i<vec.len; ++i) {
+ *              printf("    vec.buf[%d] = %d\n", i, ((int*)vec.buf)[i]);
+ *          }
+ *          VECTOR_UPDATE();
+ *          printf("    total iterated over so far: %zu\n", vec.elements_done);
+ *      }
+ *
+ * > num elements in vector: 6
+ * > (buf offset 0) contig count 2
+ * >     vec.buf[0] = 1
+ * >     vec.buf[1] = 2
+ * >     total iterated over so far: 2
+ * > (buf offset 12) contig count 2
+ * >     vec.buf[0] = 3
+ * >     vec.buf[1] = 4
+ * >     total iterated over so far: 4
+ * > (buf offset 24) contig count 2
+ * >     vec.buf[0] = 5
+ * >     vec.buf[1] = 6
+ * >     total iterated over so far: 6
+ *
+ *      VECTOR_INIT(buf,
+ *          3, 2, 12, 4,
+ *          3, 2); // This usage might represent walking the vector
+ *                 // piecemeal putting it into a packed buffer of
+ *                 // limited size.  So the 3 indicates what has
+ *                 // already been iterated over in the past and
+ *                 // 2 is the limit for the current packed buf
+ *      printf("num elements in vector: %zu, starting at %zu\n",
+ *          vec.max_elements, vec.elements_done);
+ *      while (vec.elements_done < vec.max_elements) {
+ *          VECTOR_GET_NEXT_CONTIG_BLOCK();
+ *          printf("(buf offset %d) contig count %zu\n",
+ *              (int)(vec.buf - (char*)buf), vec.len);
+ *          for (i=0; i<vec.len; ++i) {
+ *              printf("    vec.buf[%d] = %d\n", i, ((int*)vec.buf)[i]);
+ *          }
+ *          VECTOR_UPDATE();
+ *          printf("    total iterated over so far: %zu\n", vec.elements_done);
+ *      }
+ *
+ * > num elements in vector: 5, starting at 3
+ * > (buf offset 16) contig count 1
+ * >     vec.buf[0] = 4
+ * >     total iterated over so far: 4
+ * > (buf offset 24) contig count 1
+ * >     vec.buf[0] = 5
+ * >     total iterated over so far: 5
+ *
+ */
+
+/*
+ * Iteration state used by the VECTOR_* macros.  The macros do not take
+ * the state as an argument: they all read and write a variable of this
+ * type that must be named `vec` at the point of use.
+ */
+typedef struct {
+    // fields intended to be used by the user:
+    char *buf;              // start of the next contiguous run of elements
+    size_t len;             // length of next contiguous block, counted in
+           // elements (not bytes); set by VECTOR_GET_NEXT_CONTIG_BLOCK*()
+    size_t elements_done;   // num elements (total) iterated over
+           // (so if you initialize with done=3 then iterate over 2
+           // then iterate over 3 more, done=8)
+    size_t max_elements;   // value of elements_done at which iteration is
+           // finished; this is an absolute total, not a remaining count
+    // internal:
+    size_t count_per_block;        // elements in each contiguous block
+    size_t extent_between_blocks;  // bytes from one block start to the next
+    size_t element_size;           // bytes per element
+    size_t i;   // block index of the resume position, set by VECTOR_INIT
+    size_t j;   // elements already consumed inside the current block
+} vector_iteration_state_t;
+
+/*
+ * VECTOR_INIT: initialise the iteration state held in the variable `vec`
+ * (a vector_iteration_state_t that must be in scope at the point of use).
+ *
+ *   buf_                   : start of the vector-shaped buffer
+ *   nblocks_               : number of contiguous blocks in the vector
+ *   count_per_block_       : number of elements in each block
+ *   extent_between_blocks_ : bytes from the start of one block to the next
+ *   element_size_          : bytes per element
+ *   elements_already_done_ : elements consumed by earlier passes; vec.buf
+ *                            is positioned right after them
+ *   max_elements_          : upper bound on the number of elements to visit
+ *                            in this pass, relative to elements_already_done_
+ *
+ * Each argument is evaluated exactly once, so arguments with side effects
+ * are safe.  The expansion is a single statement with no trailing ';':
+ * write "VECTOR_INIT(...);" as in the examples above.
+ */
+#define VECTOR_INIT(buf_,                                                  \
+                    nblocks_,                                              \
+                    count_per_block_,                                      \
+                    extent_between_blocks_,                                \
+                    element_size_,                                         \
+                    elements_already_done_,                                \
+                    max_elements_)                                         \
+do {                                                                       \
+    vec.buf = (char*)(buf_);                                               \
+    vec.count_per_block = (count_per_block_);                              \
+    vec.extent_between_blocks = (extent_between_blocks_);                  \
+    vec.element_size = (element_size_);                                    \
+    vec.i = 0;                                                             \
+    vec.j = 0;                                                             \
+    vec.elements_done = (elements_already_done_);                          \
+                                                                           \
+    /* vec.max_elements is the min of two factors: */                      \
+    /* 1. nblocks * count_per_block (the whole vector) */                  \
+    /* 2. elements_already_done + max_elements (the caller's limit) */     \
+    /* The product is formed in size_t so that narrow argument types */    \
+    /* cannot overflow, and the limit is compared by subtraction so */     \
+    /* that a very large max_elements cannot wrap the sum. */              \
+    vec.max_elements = (size_t)(nblocks_) * vec.count_per_block;           \
+    const size_t vec_init_limit_ = (size_t)(max_elements_);                \
+    if (vec.elements_done < vec.max_elements &&                            \
+        vec_init_limit_ < vec.max_elements - vec.elements_done) {          \
+        vec.max_elements = vec.elements_done + vec_init_limit_;            \
+    }                                                                      \
+                                                                           \
+    /* Resume in the middle of the vector.  An empty block size has */     \
+    /* no position to resume at (and would divide by zero). */             \
+    if (vec.elements_done != 0 && vec.count_per_block != 0) {              \
+        vec.i = vec.elements_done / vec.count_per_block;                   \
+        vec.j = vec.elements_done % vec.count_per_block;                   \
+        vec.buf += (vec.i * vec.extent_between_blocks                      \
+                    + vec.j * vec.element_size);                           \
+    }                                                                      \
+} while (0)
+
+/*
+ * VECTOR_GET_NEXT_CONTIG_BLOCK: set vec.len to the number of elements in
+ * the contiguous run starting at vec.buf, i.e. the rest of the current
+ * block, clipped to what is left before vec.max_elements.
+ *
+ * vec.len is 0 when the iteration is already complete (the original
+ * subtraction would wrap to a huge length in that case).  The expansion
+ * has no trailing ';': write "VECTOR_GET_NEXT_CONTIG_BLOCK();".
+ */
+#define VECTOR_GET_NEXT_CONTIG_BLOCK()                                     \
+do {                                                                       \
+    if (vec.elements_done >= vec.max_elements) {                           \
+        vec.len = 0;                                                       \
+    } else {                                                               \
+        vec.len = vec.count_per_block - vec.j;                             \
+        if (vec.len > vec.max_elements - vec.elements_done) {              \
+            vec.len = vec.max_elements - vec.elements_done;                \
+        }                                                                  \
+    }                                                                      \
+} while (0)
+
+/*
+ * VECTOR_GET_NEXT_CONTIG_BLOCK_LIMITED_TO: like
+ * VECTOR_GET_NEXT_CONTIG_BLOCK, but vec.len is additionally capped at
+ * max_ elements.
+ *
+ * max_ is evaluated exactly once.  vec.len is 0 when the iteration is
+ * already complete.  The expansion has no trailing ';': write
+ * "VECTOR_GET_NEXT_CONTIG_BLOCK_LIMITED_TO(n);".
+ */
+#define VECTOR_GET_NEXT_CONTIG_BLOCK_LIMITED_TO(max_)                      \
+do {                                                                       \
+    if (vec.elements_done >= vec.max_elements) {                           \
+        vec.len = 0;                                                       \
+    } else {                                                               \
+        vec.len = vec.count_per_block - vec.j;                             \
+        if (vec.len > vec.max_elements - vec.elements_done) {              \
+            vec.len = vec.max_elements - vec.elements_done;                \
+        }                                                                  \
+    }                                                                      \
+    const size_t vec_block_cap_ = (size_t)(max_);                          \
+    if (vec.len > vec_block_cap_) {                                        \
+        vec.len = vec_block_cap_;                                          \
+    }                                                                      \
+} while (0)
+
+/*
+ * VECTOR_UPDATE: record that the vec.len elements starting at vec.buf
+ * have been processed.  Advances vec.buf to the next unprocessed element
+ * (jumping to the start of the following block when the current block is
+ * finished) and adds vec.len to vec.elements_done.
+ *
+ * vec.len is expected to come from one of the VECTOR_GET_NEXT_CONTIG_BLOCK
+ * macros.  The expansion has no trailing ';': write "VECTOR_UPDATE();".
+ */
+#define VECTOR_UPDATE()                                                    \
+do {                                                                       \
+    if (vec.len == vec.count_per_block) {                                  \
+        /* A whole block was consumed: step straight to the next one. */   \
+        vec.buf += vec.extent_between_blocks;                              \
+    } else {                                                               \
+        vec.j += vec.len;                                                  \
+        vec.buf += (vec.len * vec.element_size);                           \
+        if (vec.j == vec.count_per_block) {                                \
+            /* The partial run reached the end of the block: rewind */     \
+            /* to the block start, then step to the next block. */         \
+            vec.j = 0;                                                     \
+            vec.buf -= (vec.count_per_block * vec.element_size);           \
+            vec.buf += vec.extent_between_blocks;                          \
+        }                                                                  \
+    }                                                                      \
+    vec.elements_done += vec.len;                                          \
+} while (0)
 
 typedef struct opal_convertor_master_t {
     struct opal_convertor_master_t* next;