From 30639166f60606454afb2d7b23e29f9523487c31 Mon Sep 17 00:00:00 2001
From: Austen Lauria <awlauria@us.ibm.com>
Date: Tue, 14 Jul 2020 16:02:12 -0400
Subject: [PATCH] Improve predefined pack/unpack performance using mpich
 methods.

For the original mpich implementation, see:
https://github.com/pmodels/mpich/blob/9ab5fd06af2a648bf24214f0d9cff0ee77ee3e7d/src/mpi/datatype/veccpy.h

Small testcase to demonstrate the performance difference:
https://gist.github.com/markalle/9f92e9facbd71136bcfb9f0e0305a1da
% mpicc -o x packperf_nc.c
% mpirun -np 1 ./x

Before:
> pack dtbig    :    943    863    863    862    862  (avg 879)  usec
> unpack dtbig  :    919    955    917    917    917  (avg 925)  usec
> pack dtsmall  :    810    810    810    831    810  (avg 814)  usec
> unpack dtsmall:    947    954    996    941    969  (avg 962)  usec
After:
> pack dtbig    :    205    124    120    118    118  (avg 137)  usec
> unpack dtbig  :    122    120    120    120    120  (avg 120)  usec
> pack dtsmall  :    133    122    122    121    121  (avg 124)  usec
> unpack dtsmall:    124    124    123    123    123  (avg 123)  usec

Having lots of small memcpy() was slower than the mpich code that
uses blocks of array assignments.

Notes about what changed:
* at the top-level mpi/c/pack.c and unpack.c it now sometimes
  turns (count, dtype) into (1, newdtype) with a newdtype made
  by MPI_Type_contiguous(count, dtype).  This is because the lower
  level pack/unpack always iterates over description elements and
  when it sees (count,dtype) there's no possibility of a single
  description element describing the whole data.

  I'm triggering that code only when the count is >=250 and
  the type is non-contiguous.  It likely only needs to be triggered
  if the datatype has a single element such that the
  element.count * element.extent == dtype.extent but that would
  be more code to detect.
* in opal_datatype_internal.h I moved the macros around a little so
  I could reuse them in the new unrolled array assignments code.
  That way I don't have to figure out that INT4 is int32_t, because
  those macros already have that info.  The diff probably looks
  large but there isn't that much going on there.
* in opal_datatype_pack/unpack.h there's an extra section to call
  the mpich vector copying code for a description element if
  it's a certain size, and continue with the regular code if
  the mpich call rejects it (due to not recognizing the element.id,
  or due to it being cuda memory, or due to alignment)
* the new opal_datatype_pack_unpack_predefined.h is largely copied
  from mpich.  The macros boil down to unrolled array assignments.
  I recycled the opal_datatype_internal.h macros to get the values
  for TYPE.  That way I don't have to figure out whether
  SHORT_FLOAT_COMPLEX is short float _Complex or opal_short_float_complex_t
  or unavailable for example.

Extra notes about the new pack/unpack routine:
* For checking cuda memory I didn't check every item in the vector,
  only the first and possibly the last, since I don't think individual
  description elements should be spanning gpu and system memory.
* I didn't use the unaligned-stride code from mpich, instead just
  rejecting anything unaligned

Licensing:
https://github.com/pmodels/mpich/blob/9ab5fd06af2a648bf24214f0d9cff0ee77ee3e7d/src/mpi/datatype/veccpy.h
where the code came from says
> /*
>  *  (C) 2001 by Argonne National Laboratory.
>  *      See COPYRIGHT in top-level directory.
>  */
And I pasted the above mentioned COPYRIGHT at the top of
opal_datatype_pack_unpack_predefined.h

Signed-off-by: Mark Allen <markalle@us.ibm.com>
---
 LICENSE                                       |  41 ++
 ompi/datatype/ompi_datatype.h                 |  90 +++
 ompi/mpi/c/pack.c                             |  23 +-
 ompi/mpi/c/unpack.c                           |  23 +-
 opal/datatype/Makefile.am                     |   2 +
 opal/datatype/opal_datatype_internal.h        | 189 +++++--
 opal/datatype/opal_datatype_pack.h            |  11 +-
 .../opal_datatype_pack_unpack_predefined.h    | 532 ++++++++++++++++++
 opal/datatype/opal_datatype_unpack.h          |   9 +
 9 files changed, 864 insertions(+), 56 deletions(-)
 create mode 100644 opal/datatype/opal_datatype_pack_unpack_predefined.h

diff --git a/LICENSE b/LICENSE
index 906630dcc6..4ed90b0f87 100644
--- a/LICENSE
+++ b/LICENSE
@@ -58,6 +58,8 @@ Copyright (c) 2017-2020 Amazon.com, Inc. or its affiliates.  All Rights
 Copyright (c) 2018      DataDirect Networks. All rights reserved.
 Copyright (c) 2018-2020 Triad National Security, LLC. All rights reserved.
 Copyright (c) 2020      Google, LLC. All rights reserved.
+Copyright (c) 2002      University of Chicago
+Copyright (c) 2001      Argonne National Laboratory
 
 $COPYRIGHT$
 
@@ -99,3 +101,42 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+----------------[Copyright from inclusion of MPICH code]----------------
+
+The following is a notice of limited availability of the code, and disclaimer
+which must be included in the prologue of the code and in all source listings
+of the code.
+
+Copyright Notice
+ + 2002 University of Chicago
+
+Permission is hereby granted to use, reproduce, prepare derivative works, and
+to redistribute to others.  This software was authored by:
+
+Mathematics and Computer Science Division
+Argonne National Laboratory, Argonne IL 60439
+
+(and)
+
+Department of Computer Science
+University of Illinois at Urbana-Champaign
+
+
+			      GOVERNMENT LICENSE
+
+Portions of this material resulted from work developed under a U.S.
+Government Contract and are subject to the following license: the Government
+is granted for itself and others acting on its behalf a paid-up, nonexclusive,
+irrevocable worldwide license in this computer software to reproduce, prepare
+derivative works, and perform publicly and display publicly.
+
+				  DISCLAIMER
+
+This computer code material was prepared, in part, as an account of work
+sponsored by an agency of the United States Government.  Neither the United
+States, nor the University of Chicago, nor any of their employees, makes any
+warranty express or implied, or assumes any legal liability or responsibility
+for the accuracy, completeness, or usefulness of any information, apparatus,
+product, or process disclosed, or represents that its use would not infringe
+privately owned rights.
diff --git a/ompi/datatype/ompi_datatype.h b/ompi/datatype/ompi_datatype.h
index 9d91b88a5d..e2ee2a79c0 100644
--- a/ompi/datatype/ompi_datatype.h
+++ b/ompi/datatype/ompi_datatype.h
@@ -10,6 +10,7 @@
  * Copyright (c) 2015-2020 Research Organization for Information Science
  *                         and Technology (RIST).  All rights reserved.
  * Copyright (c) 2018      FUJITSU LIMITED.  All rights reserved.
+ * Copyright (c) 2021      IBM Corporation. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -418,5 +419,94 @@ OMPI_DECLSPEC int ompi_datatype_pack_external_size( const char datarep[], int in
         }                                                               \
     }
 
+/*
+ * Sometimes it's faster to operate on a (count,datatype) pair if it's
+ * converted to (1,larger_datatype).  This comes up in pack/unpack if
+ * the datatype is [int4b,empty4b] for example.  With that datatype the
+ * (count,datatype) path has to loop over the count processing each
+ * occurrence of the datatype, but a larger type created via
+ * MPI_Type_contiguous(count,datatype,&newtype) will have a single description
+ * entry describing the whole vector and go through pack/unpack much
+ * faster.
+ *
+ * These functions convert an incoming (count,dt) if the performance
+ * is potentially better.
+ *
+ * Note this function is only likely to be useful if the (count,datatype)
+ * describes a simple evenly spaced vector that will boil down to a
+ * single description element, but I don't think it's cheap to traverse
+ * the incoming datatype to check if that will be the case.  Eg I'm not
+ * sure it would be cheap enough to check that
+ *   [int,int,space,int,int,space]  is going to convert nicely, vs
+ *   [int,int,space,int,space]      which isn't.
+ * So the only checks performed are that the (count,datatype) isn't
+ * contiguous, and that the count is large enough to justify the
+ * overhead of making a new datatype.
+ */
+typedef struct {
+    MPI_Datatype dt;            /* datatype to use: the caller's, or a new contiguous one */
+    MPI_Count count;            /* count to use with dt: the caller's, or 1 for a new type */
+    int new_type_was_created;   /* nonzero if dt was created here and must be destroyed */
+} ompi_datatype_consolidate_t;
+
+/* Fill *dtmod with the (count,type) pair pack/unpack should use; returns
+ * OMPI_SUCCESS or the error code of the datatype call that failed. */
+static inline int
+ompi_datatype_consolidate_create(MPI_Count count, MPI_Datatype dtype,
+    ompi_datatype_consolidate_t *dtmod, int threshold)
+{
+    int rc;
+    size_t dtsize;
+    MPI_Aint lb, extent;
+    MPI_Datatype newtype = NULL;
+    /* Default: pass (count,dtype) through.  dtmod is only switched to a new
+     * type once it is fully built, so every error return leaves it usable. */
+    dtmod->dt = dtype;
+    dtmod->count = count;
+    dtmod->new_type_was_created = 0;
+    if (count < threshold) { return OMPI_SUCCESS; }
+    opal_datatype_type_size ( &dtype->super, &dtsize);
+    rc = ompi_datatype_get_extent( dtype, &lb, &extent );
+    if (rc != OMPI_SUCCESS) { return rc; }
+    if ((dtype->super.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) &&
+        (MPI_Aint)dtsize == extent) {
+        return OMPI_SUCCESS; /* contig, no advantage to making a new type */
+    }
+    rc = ompi_datatype_create_contiguous( count, dtype, &newtype );
+    if (rc != OMPI_SUCCESS) { return rc; }
+    rc = ompi_datatype_commit(&newtype);
+    if (rc != OMPI_SUCCESS) { ompi_datatype_destroy(&newtype); return rc; }
+    dtmod->dt = newtype;
+    dtmod->count = 1;
+    dtmod->new_type_was_created = 1;
+    return OMPI_SUCCESS;
+}
+static inline int
+ompi_datatype_consolidate_free(ompi_datatype_consolidate_t *dtmod)
+{
+    /* nothing to release when create() passed the caller's type through */
+    if (!dtmod->new_type_was_created) {
+        return OMPI_SUCCESS;
+    }
+    /* clear the flag so an accidental second call becomes a no-op */
+    dtmod->new_type_was_created = 0;
+    return ompi_datatype_destroy( &dtmod->dt );
+}
+/*
+ *  The magic number below just came from empirical testing on a couple
+ *  local PPC machines using [int,space] as the datatype.  There's some
+ *  overhead in constructing a new datatype, so just walking a sequence of
+ *  description elements is better for a short list of elements vs
+ *  creating a potentially shorter list and hoping the vector-walking
+ *  of the new elements is faster.  This could maybe be tuned dynamically
+ *  but it doesn't really seem worth it.
+ *
+ *  I only tested on two machines, the crossover point for pack and unpack
+ *  were 80 and 62 on one machine, and 250 and 220 on the other.  So I lean
+ *  toward using 250 for both and assuming that's likely to not waste too
+ *  much overhead on the datatype creation for most cases.
+ */
+#define OMPI_DATATYPE_CONSOLIDATE_THRESHOLD 250  /* MPI_Pack/MPI_Unpack consolidate only when count >= this */
+
 END_C_DECLS
 #endif  /* OMPI_DATATYPE_H_HAS_BEEN_INCLUDED */
diff --git a/ompi/mpi/c/pack.c b/ompi/mpi/c/pack.c
index 249186fbe6..6c92f53003 100644
--- a/ompi/mpi/c/pack.c
+++ b/ompi/mpi/c/pack.c
@@ -15,6 +15,7 @@
  *                         reserved.
  * Copyright (c) 2015-2018 Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
+ * Copyright (c) 2021      IBM Corporation. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -77,10 +78,25 @@ int MPI_Pack(const void *inbuf, int incount, MPI_Datatype datatype,
 
     OPAL_CR_ENTER_LIBRARY();
 
+    /*
+     * If a datatype's description contains a single element that describes
+     * a large vector that path is reasonably optimized in pack/unpack. On
+     * the other hand if the count and datatype combined describe the same
+     * vector, that gets processed one element at a time.
+     *
+     * So at the top level we morph the call if the count and datatype look
+     * like a good vector.
+     */
+    ompi_datatype_consolidate_t dtmod;
+    rc = ompi_datatype_consolidate_create(incount, datatype, &dtmod,
+        OMPI_DATATYPE_CONSOLIDATE_THRESHOLD);
+    OMPI_ERRHANDLER_CHECK(rc, comm, rc, FUNC_NAME);
+
     OBJ_CONSTRUCT( &local_convertor, opal_convertor_t );
     /* the resulting convertor will be set to the position ZERO */
-    opal_convertor_copy_and_prepare_for_send( ompi_mpi_local_convertor, &(datatype->super),
-                                              incount, (void *) inbuf, 0, &local_convertor );
+    opal_convertor_copy_and_prepare_for_send( ompi_mpi_local_convertor,
+                                              &(dtmod.dt->super), dtmod.count,
+                                              (void *) inbuf, 0, &local_convertor );
 
     /* Check for truncation */
     opal_convertor_get_packed_size( &local_convertor, &size );
@@ -100,6 +116,9 @@ int MPI_Pack(const void *inbuf, int incount, MPI_Datatype datatype,
     *position += size;
     OBJ_DESTRUCT( &local_convertor );
 
+    rc = ompi_datatype_consolidate_free(&dtmod);
+    OMPI_ERRHANDLER_CHECK(rc, comm, rc, FUNC_NAME);
+
     OPAL_CR_EXIT_LIBRARY();
 
     /* All done.  Note that the convertor returns 1 upon success, not
diff --git a/ompi/mpi/c/unpack.c b/ompi/mpi/c/unpack.c
index eb482e7481..ff61ed38d4 100644
--- a/ompi/mpi/c/unpack.c
+++ b/ompi/mpi/c/unpack.c
@@ -12,6 +12,7 @@
  * Copyright (c) 2006-2013 Cisco Systems, Inc.  All rights reserved.
  * Copyright (c) 2015-2018 Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
+ * Copyright (c) 2021      IBM Corporation. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -79,12 +80,27 @@ int MPI_Unpack(const void *inbuf, int insize, int *position,
 
     OPAL_CR_ENTER_LIBRARY();
 
+    /*
+     * If a datatype's description contains a single element that describes
+     * a large vector that path is reasonably optimized in pack/unpack. On
+     * the other hand if the count and datatype combined describe the same
+     * vector, that gets processed one element at a time.
+     *
+     * So at the top level we morph the call if the count and datatype look
+     * like a good vector.
+     */
+    ompi_datatype_consolidate_t dtmod;
+    rc = ompi_datatype_consolidate_create(outcount, datatype, &dtmod,
+        OMPI_DATATYPE_CONSOLIDATE_THRESHOLD);
+    OMPI_ERRHANDLER_CHECK(rc, comm, rc, FUNC_NAME);
+
     if( insize > 0 ) {
         int ret;
         OBJ_CONSTRUCT( &local_convertor, opal_convertor_t );
         /* the resulting convertor will be set the the position ZERO */
-        opal_convertor_copy_and_prepare_for_recv( ompi_mpi_local_convertor, &(datatype->super),
-                                                  outcount, outbuf, 0, &local_convertor );
+        opal_convertor_copy_and_prepare_for_recv( ompi_mpi_local_convertor,
+                                                  &(dtmod.dt->super), dtmod.count,
+                                                  outbuf, 0, &local_convertor );
 
         /* Check for truncation */
         opal_convertor_get_packed_size( &local_convertor, &size );
@@ -110,6 +126,9 @@ int MPI_Unpack(const void *inbuf, int insize, int *position,
         }
     }
 
+    rc = ompi_datatype_consolidate_free(&dtmod);
+    OMPI_ERRHANDLER_CHECK(rc, comm, rc, FUNC_NAME);
+
     OPAL_CR_EXIT_LIBRARY();
 
     OMPI_ERRHANDLER_RETURN(rc, comm, MPI_ERR_UNKNOWN, FUNC_NAME);
diff --git a/opal/datatype/Makefile.am b/opal/datatype/Makefile.am
index daaaa8e4b0..36d13eff3b 100644
--- a/opal/datatype/Makefile.am
+++ b/opal/datatype/Makefile.am
@@ -17,6 +17,7 @@
 # Copyright (c) 2011-2013 NVIDIA Corporation.  All rights reserved.
 # Copyright (c) 2018      Research Organization for Information Science
 #                         and Technology (RIST). All rights reserved.
+# Copyright (c) 2021      IBM Corporation. All rights reserved.
 # $COPYRIGHT$
 #
 # Additional copyrights may follow
@@ -32,6 +33,7 @@ headers = \
         opal_datatype_internal.h \
         opal_datatype_copy.h \
         opal_datatype_memcpy.h \
+        opal_datatype_pack_unpack_predefined.h \
         opal_datatype_pack.h \
         opal_datatype_prototypes.h \
         opal_datatype_unpack.h
diff --git a/opal/datatype/opal_datatype_internal.h b/opal/datatype/opal_datatype_internal.h
index 04915cddcb..b14acdf616 100644
--- a/opal/datatype/opal_datatype_internal.h
+++ b/opal/datatype/opal_datatype_internal.h
@@ -17,6 +17,7 @@
  * Copyright (c) 2017      Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
  * Copyright (c) 2018      FUJITSU LIMITED.  All rights reserved.
+ * Copyright (c) 2021      IBM Corporation. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -334,119 +335,205 @@ struct opal_datatype_t;
         .ptypes = OPAL_DATATYPE_INIT_PTYPES_ARRAY_UNAVAILABLE                        \
     }
 
+/*
+ * Predefined-type initializers.
+ *
+ * Each OPAL_DATATYPE_INITIALIZER_<T>(FLAGS) forwards to the matching
+ * OPAL_DATATYPE_HANDLE_<T>(AV, NOTAV, FLAGS) selector defined further
+ * down, passing
+ *     AV    = OPAL_DATATYPE_INIT_BASIC_DATATYPE
+ *     NOTAV = OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED
+ * The selector picks AV or NOTAV and supplies its arguments, so the
+ * C type / alignment / name knowledge lives only in the HANDLE macros
+ * and can be reused with other AV/NOTAV callbacks.
+ */
+/* signed integers */
+#define OPAL_DATATYPE_INITIALIZER_INT1(FLAGS) \
+    OPAL_DATATYPE_HANDLE_INT1(OPAL_DATATYPE_INIT_BASIC_DATATYPE, \
+                              OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_INT2(FLAGS) \
+    OPAL_DATATYPE_HANDLE_INT2(OPAL_DATATYPE_INIT_BASIC_DATATYPE, \
+                              OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_INT4(FLAGS) \
+    OPAL_DATATYPE_HANDLE_INT4(OPAL_DATATYPE_INIT_BASIC_DATATYPE, \
+                              OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_INT8(FLAGS) \
+    OPAL_DATATYPE_HANDLE_INT8(OPAL_DATATYPE_INIT_BASIC_DATATYPE, \
+                              OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_INT16(FLAGS) \
+    OPAL_DATATYPE_HANDLE_INT16(OPAL_DATATYPE_INIT_BASIC_DATATYPE, \
+                               OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+
+/* unsigned integers */
+#define OPAL_DATATYPE_INITIALIZER_UINT1(FLAGS) \
+    OPAL_DATATYPE_HANDLE_UINT1(OPAL_DATATYPE_INIT_BASIC_DATATYPE, \
+                               OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_UINT2(FLAGS) \
+    OPAL_DATATYPE_HANDLE_UINT2(OPAL_DATATYPE_INIT_BASIC_DATATYPE, \
+                               OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_UINT4(FLAGS) \
+    OPAL_DATATYPE_HANDLE_UINT4(OPAL_DATATYPE_INIT_BASIC_DATATYPE, \
+                               OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_UINT8(FLAGS) \
+    OPAL_DATATYPE_HANDLE_UINT8(OPAL_DATATYPE_INIT_BASIC_DATATYPE, \
+                               OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_UINT16(FLAGS) \
+    OPAL_DATATYPE_HANDLE_UINT16(OPAL_DATATYPE_INIT_BASIC_DATATYPE, \
+                                OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+
+/* floating point, by size in bytes */
+#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS) \
+    OPAL_DATATYPE_HANDLE_FLOAT2(OPAL_DATATYPE_INIT_BASIC_DATATYPE, \
+                                OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS) \
+    OPAL_DATATYPE_HANDLE_FLOAT4(OPAL_DATATYPE_INIT_BASIC_DATATYPE, \
+                                OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS) \
+    OPAL_DATATYPE_HANDLE_FLOAT8(OPAL_DATATYPE_INIT_BASIC_DATATYPE, \
+                                OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS) \
+    OPAL_DATATYPE_HANDLE_FLOAT12(OPAL_DATATYPE_INIT_BASIC_DATATYPE, \
+                                 OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS) \
+    OPAL_DATATYPE_HANDLE_FLOAT16(OPAL_DATATYPE_INIT_BASIC_DATATYPE, \
+                                 OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+
+/* complex */
+#define OPAL_DATATYPE_INITIALIZER_SHORT_FLOAT_COMPLEX(FLAGS) \
+    OPAL_DATATYPE_HANDLE_SHORT_FLOAT_COMPLEX(OPAL_DATATYPE_INIT_BASIC_DATATYPE, \
+                                             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_FLOAT_COMPLEX(FLAGS) \
+    OPAL_DATATYPE_HANDLE_FLOAT_COMPLEX(OPAL_DATATYPE_INIT_BASIC_DATATYPE, \
+                                       OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_DOUBLE_COMPLEX(FLAGS) \
+    OPAL_DATATYPE_HANDLE_DOUBLE_COMPLEX(OPAL_DATATYPE_INIT_BASIC_DATATYPE, \
+                                        OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_LONG_DOUBLE_COMPLEX(FLAGS) \
+    OPAL_DATATYPE_HANDLE_LONG_DOUBLE_COMPLEX(OPAL_DATATYPE_INIT_BASIC_DATATYPE, \
+                                             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+
+/* bool and wide char */
+#define OPAL_DATATYPE_INITIALIZER_BOOL(FLAGS) \
+    OPAL_DATATYPE_HANDLE_BOOL(OPAL_DATATYPE_INIT_BASIC_DATATYPE, \
+                              OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_WCHAR(FLAGS) \
+    OPAL_DATATYPE_HANDLE_WCHAR(OPAL_DATATYPE_INIT_BASIC_DATATYPE, \
+                               OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
 #define OPAL_DATATYPE_INITIALIZER_LOOP(FLAGS)       OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_LOOP, LOOP_S, FLAGS )
 #define OPAL_DATATYPE_INITIALIZER_END_LOOP(FLAGS)   OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_END_LOOP, LOOP_E, FLAGS )
 #define OPAL_DATATYPE_INITIALIZER_LB(FLAGS)         OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_LB, LB, FLAGS )
 #define OPAL_DATATYPE_INITIALIZER_UB(FLAGS)         OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_UB, UB, FLAGS )
-#define OPAL_DATATYPE_INITIALIZER_INT1(FLAGS)       OPAL_DATATYPE_INIT_BASIC_DATATYPE( int8_t, OPAL_ALIGNMENT_INT8, INT1, FLAGS )
-#define OPAL_DATATYPE_INITIALIZER_INT2(FLAGS)       OPAL_DATATYPE_INIT_BASIC_DATATYPE( int16_t, OPAL_ALIGNMENT_INT16, INT2, FLAGS )
-#define OPAL_DATATYPE_INITIALIZER_INT4(FLAGS)       OPAL_DATATYPE_INIT_BASIC_DATATYPE( int32_t, OPAL_ALIGNMENT_INT32, INT4, FLAGS )
-#define OPAL_DATATYPE_INITIALIZER_INT8(FLAGS)       OPAL_DATATYPE_INIT_BASIC_DATATYPE( int64_t, OPAL_ALIGNMENT_INT64, INT8, FLAGS )
+
+
+#define OPAL_DATATYPE_HANDLE_INT1(AV, NOTAV, FLAGS)       AV( int8_t, OPAL_ALIGNMENT_INT8, INT1, FLAGS )
+#define OPAL_DATATYPE_HANDLE_INT2(AV, NOTAV, FLAGS)       AV( int16_t, OPAL_ALIGNMENT_INT16, INT2, FLAGS )
+#define OPAL_DATATYPE_HANDLE_INT4(AV, NOTAV, FLAGS)       AV( int32_t, OPAL_ALIGNMENT_INT32, INT4, FLAGS )
+#define OPAL_DATATYPE_HANDLE_INT8(AV, NOTAV, FLAGS)       AV( int64_t, OPAL_ALIGNMENT_INT64, INT8, FLAGS )
 #ifdef HAVE_INT128_T
-#define OPAL_DATATYPE_INITIALIZER_INT16(FLAGS)      OPAL_DATATYPE_INIT_BASIC_DATATYPE( int128_t, OPAL_ALIGNMENT_INT128, INT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_INT16(AV, NOTAV, FLAGS)      AV( int128_t, OPAL_ALIGNMENT_INT128, INT16, FLAGS )
 #else
-#define OPAL_DATATYPE_INITIALIZER_INT16(FLAGS)      OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( INT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_INT16(AV, NOTAV, FLAGS)      NOTAV( INT16, FLAGS ) /* no int128_t: hand INT16 to the not-available callback */
 #endif
-#define OPAL_DATATYPE_INITIALIZER_UINT1(FLAGS)      OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint8_t, OPAL_ALIGNMENT_INT8, UINT1, FLAGS )
-#define OPAL_DATATYPE_INITIALIZER_UINT2(FLAGS)      OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint16_t, OPAL_ALIGNMENT_INT16, UINT2, FLAGS )
-#define OPAL_DATATYPE_INITIALIZER_UINT4(FLAGS)      OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint32_t, OPAL_ALIGNMENT_INT32, UINT4, FLAGS )
-#define OPAL_DATATYPE_INITIALIZER_UINT8(FLAGS)      OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint64_t, OPAL_ALIGNMENT_INT64, UINT8, FLAGS )
+#define OPAL_DATATYPE_HANDLE_UINT1(AV, NOTAV, FLAGS)      AV( uint8_t, OPAL_ALIGNMENT_INT8, UINT1, FLAGS )
+#define OPAL_DATATYPE_HANDLE_UINT2(AV, NOTAV, FLAGS)      AV( uint16_t, OPAL_ALIGNMENT_INT16, UINT2, FLAGS )
+#define OPAL_DATATYPE_HANDLE_UINT4(AV, NOTAV, FLAGS)      AV( uint32_t, OPAL_ALIGNMENT_INT32, UINT4, FLAGS )
+#define OPAL_DATATYPE_HANDLE_UINT8(AV, NOTAV, FLAGS)      AV( uint64_t, OPAL_ALIGNMENT_INT64, UINT8, FLAGS )
 #ifdef HAVE_UINT128_T
-#define OPAL_DATATYPE_INITIALIZER_UINT16(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint128_t, OPAL_ALIGNMENT_INT128, UINT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_UINT16(AV, NOTAV, FLAGS)     AV( uint128_t, OPAL_ALIGNMENT_INT128, UINT16, FLAGS )
 #else
-#define OPAL_DATATYPE_INITIALIZER_UINT16(FLAGS)     OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( INT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_UINT16(AV, NOTAV, FLAGS)     NOTAV( UINT16, FLAGS ) /* no uint128_t: name must be UINT16, not INT16 */
 #endif
 
 #if defined(HAVE_SHORT_FLOAT) && SIZEOF_SHORT_FLOAT == 2
-#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT2, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT2(AV, NOTAV, FLAGS)     AV( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT2, FLAGS )
 #elif SIZEOF_FLOAT == 2
-#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT2, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT2(AV, NOTAV, FLAGS)     AV( float, OPAL_ALIGNMENT_FLOAT, FLOAT2, FLAGS )
 #elif SIZEOF_DOUBLE == 2
-#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT2, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT2(AV, NOTAV, FLAGS)     AV( double, OPAL_ALIGNMENT_DOUBLE, FLOAT2, FLAGS )
 #elif SIZEOF_LONG_DOUBLE == 2
-#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT2, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT2(AV, NOTAV, FLAGS)     AV( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT2, FLAGS )
 #elif defined(HAVE_OPAL_SHORT_FLOAT_T) && SIZEOF_OPAL_SHORT_FLOAT_T == 2
-#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT2, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT2(AV, NOTAV, FLAGS)     AV( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT2, FLAGS )
 #else
-#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS)     OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT2, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT2(AV, NOTAV, FLAGS)     NOTAV( FLOAT2, FLAGS ) /* none of the candidate types above is 2 bytes */
 #endif
 
 #if defined(HAVE_SHORT_FLOAT) && SIZEOF_SHORT_FLOAT == 4
-#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT4, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT4(AV, NOTAV, FLAGS)     AV( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT4, FLAGS )
 #elif SIZEOF_FLOAT == 4
-#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT4, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT4(AV, NOTAV, FLAGS)     AV( float, OPAL_ALIGNMENT_FLOAT, FLOAT4, FLAGS )
 #elif SIZEOF_DOUBLE == 4
-#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT4, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT4(AV, NOTAV, FLAGS)     AV( double, OPAL_ALIGNMENT_DOUBLE, FLOAT4, FLAGS )
 #elif SIZEOF_LONG_DOUBLE == 4
-#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT4, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT4(AV, NOTAV, FLAGS)     AV( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT4, FLAGS )
 #elif defined(HAVE_OPAL_SHORT_FLOAT_T) && SIZEOF_OPAL_SHORT_FLOAT_T == 4
-#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT4, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT4(AV, NOTAV, FLAGS)     AV( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT4, FLAGS )
 #else
-#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS)     OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT4, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT4(AV, NOTAV, FLAGS)     NOTAV( FLOAT4, FLAGS ) /* none of the candidate types above is 4 bytes */
 #endif
 
 #if defined(HAVE_SHORT_FLOAT) && SIZEOF_SHORT_FLOAT == 8
-#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT8, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT8(AV, NOTAV, FLAGS)     AV( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT8, FLAGS )
 #elif SIZEOF_FLOAT == 8
-#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT8, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT8(AV, NOTAV, FLAGS)     AV( float, OPAL_ALIGNMENT_FLOAT, FLOAT8, FLAGS )
 #elif SIZEOF_DOUBLE == 8
-#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT8, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT8(AV, NOTAV, FLAGS)     AV( double, OPAL_ALIGNMENT_DOUBLE, FLOAT8, FLAGS )
 #elif SIZEOF_LONG_DOUBLE == 8
-#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT8, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT8(AV, NOTAV, FLAGS)     AV( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT8, FLAGS )
 #elif defined(HAVE_OPAL_SHORT_FLOAT_T) && SIZEOF_OPAL_SHORT_FLOAT_T == 8
-#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT8, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT8(AV, NOTAV, FLAGS)     AV( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT8, FLAGS )
 #else
-#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS)     OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT8, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT8(AV, NOTAV, FLAGS)     NOTAV( FLOAT8, FLAGS ) /* none of the candidate types above is 8 bytes */
 #endif
 
 #if defined(HAVE_SHORT_FLOAT) && SIZEOF_SHORT_FLOAT == 12
-#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS)    OPAL_DATATYPE_INIT_BASIC_DATATYPE( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT12, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT12(AV, NOTAV, FLAGS)    AV( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT12, FLAGS )
 #elif SIZEOF_FLOAT == 12
-#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS)    OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT12, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT12(AV, NOTAV, FLAGS)    AV( float, OPAL_ALIGNMENT_FLOAT, FLOAT12, FLAGS )
 #elif SIZEOF_DOUBLE == 12
-#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS)    OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT12, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT12(AV, NOTAV, FLAGS)    AV( double, OPAL_ALIGNMENT_DOUBLE, FLOAT12, FLAGS )
 #elif SIZEOF_LONG_DOUBLE == 12
-#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS)    OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT12, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT12(AV, NOTAV, FLAGS)    AV( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT12, FLAGS )
 #elif defined(HAVE_OPAL_SHORT_FLOAT_T) && SIZEOF_OPAL_SHORT_FLOAT_T == 12
-#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS)    OPAL_DATATYPE_INIT_BASIC_DATATYPE( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT12, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT12(AV, NOTAV, FLAGS)    AV( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT12, FLAGS )
 #else
-#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS)    OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT12, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT12(AV, NOTAV, FLAGS)    NOTAV( FLOAT12, FLAGS ) /* none of the candidate types above is 12 bytes */
 #endif
 
 #if defined(HAVE_SHORT_FLOAT) && SIZEOF_SHORT_FLOAT == 16
-#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS)    OPAL_DATATYPE_INIT_BASIC_DATATYPE( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT16(AV, NOTAV, FLAGS)    AV( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT16, FLAGS )
 #elif SIZEOF_FLOAT == 16
-#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS)    OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT16(AV, NOTAV, FLAGS)    AV( float, OPAL_ALIGNMENT_FLOAT, FLOAT16, FLAGS )
 #elif SIZEOF_DOUBLE == 16
-#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS)    OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT16(AV, NOTAV, FLAGS)    AV( double, OPAL_ALIGNMENT_DOUBLE, FLOAT16, FLAGS )
 #elif SIZEOF_LONG_DOUBLE == 16
-#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS)    OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT16(AV, NOTAV, FLAGS)    AV( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT16, FLAGS )
 #elif defined(HAVE_OPAL_SHORT_FLOAT_T) && SIZEOF_OPAL_SHORT_FLOAT_T == 16
-#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS)    OPAL_DATATYPE_INIT_BASIC_DATATYPE( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT16(AV, NOTAV, FLAGS)    AV( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT16, FLAGS )
 #else
-#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS)    OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT16(AV, NOTAV, FLAGS)    NOTAV( FLOAT16, FLAGS )
 #endif
 
 #if defined(HAVE_SHORT_FLOAT__COMPLEX)
-#define OPAL_DATATYPE_INITIALIZER_SHORT_FLOAT_COMPLEX(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( short float _Complex, OPAL_ALIGNMENT_SHORT_FLOAT_COMPLEX, SHORT_FLOAT_COMPLEX, FLAGS )
+#define OPAL_DATATYPE_HANDLE_SHORT_FLOAT_COMPLEX(AV, NOTAV, FLAGS) AV( short float _Complex, OPAL_ALIGNMENT_SHORT_FLOAT_COMPLEX, SHORT_FLOAT_COMPLEX, FLAGS )
 #elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T)
-#define OPAL_DATATYPE_INITIALIZER_SHORT_FLOAT_COMPLEX(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( opal_short_float_complex_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, SHORT_FLOAT_COMPLEX, FLAGS )
+#define OPAL_DATATYPE_HANDLE_SHORT_FLOAT_COMPLEX(AV, NOTAV, FLAGS) AV( opal_short_float_complex_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, SHORT_FLOAT_COMPLEX, FLAGS )
 #else
-#define OPAL_DATATYPE_INITIALIZER_SHORT_FLOAT_COMPLEX(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( SHORT_FLOAT_COMPLEX, FLAGS)
+#define OPAL_DATATYPE_HANDLE_SHORT_FLOAT_COMPLEX(AV, NOTAV, FLAGS) NOTAV( SHORT_FLOAT_COMPLEX, FLAGS)
 #endif
 
-#define OPAL_DATATYPE_INITIALIZER_FLOAT_COMPLEX(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( float _Complex, OPAL_ALIGNMENT_FLOAT_COMPLEX, FLOAT_COMPLEX, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT_COMPLEX(AV, NOTAV, FLAGS) AV( float _Complex, OPAL_ALIGNMENT_FLOAT_COMPLEX, FLOAT_COMPLEX, FLAGS )
 
-#define OPAL_DATATYPE_INITIALIZER_DOUBLE_COMPLEX(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( double _Complex, OPAL_ALIGNMENT_DOUBLE_COMPLEX, DOUBLE_COMPLEX, FLAGS )
+#define OPAL_DATATYPE_HANDLE_DOUBLE_COMPLEX(AV, NOTAV, FLAGS) AV( double _Complex, OPAL_ALIGNMENT_DOUBLE_COMPLEX, DOUBLE_COMPLEX, FLAGS )
 
-#define OPAL_DATATYPE_INITIALIZER_LONG_DOUBLE_COMPLEX(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double _Complex, OPAL_ALIGNMENT_LONG_DOUBLE_COMPLEX, LONG_DOUBLE_COMPLEX, FLAGS )
+#define OPAL_DATATYPE_HANDLE_LONG_DOUBLE_COMPLEX(AV, NOTAV, FLAGS) AV( long double _Complex, OPAL_ALIGNMENT_LONG_DOUBLE_COMPLEX, LONG_DOUBLE_COMPLEX, FLAGS )
 
-#define OPAL_DATATYPE_INITIALIZER_BOOL(FLAGS)       OPAL_DATATYPE_INIT_BASIC_DATATYPE( _Bool, OPAL_ALIGNMENT_BOOL, BOOL, FLAGS )
+#define OPAL_DATATYPE_HANDLE_BOOL(AV, NOTAV, FLAGS)       AV( _Bool, OPAL_ALIGNMENT_BOOL, BOOL, FLAGS )
 
 #if OPAL_ALIGNMENT_WCHAR != 0
-#define OPAL_DATATYPE_INITIALIZER_WCHAR(FLAGS)      OPAL_DATATYPE_INIT_BASIC_DATATYPE( wchar_t, OPAL_ALIGNMENT_WCHAR, WCHAR, FLAGS )
+#define OPAL_DATATYPE_HANDLE_WCHAR(AV, NOTAV, FLAGS)      AV( wchar_t, OPAL_ALIGNMENT_WCHAR, WCHAR, FLAGS )
 #else
-#define OPAL_DATATYPE_INITIALIZER_WCHAR(FLAGS)      OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( WCHAR, FLAGS )
+#define OPAL_DATATYPE_HANDLE_WCHAR(AV, NOTAV, FLAGS)      NOTAV( WCHAR, FLAGS )
 #endif
 
 #define BASIC_DDT_FROM_ELEM( ELEM ) (opal_datatype_basicDatatypes[(ELEM).elem.common.type])
diff --git a/opal/datatype/opal_datatype_pack.h b/opal/datatype/opal_datatype_pack.h
index d200f911fd..2031a005e7 100644
--- a/opal/datatype/opal_datatype_pack.h
+++ b/opal/datatype/opal_datatype_pack.h
@@ -8,6 +8,7 @@
  * Copyright (c) 2017-2018 Research Organization for Information Science
  *                         and Technology (RIST).  All rights reserved.
  * Copyright (c) 2020      Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2020-2021 IBM Corporation.  All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -19,6 +20,7 @@
 #define OPAL_DATATYPE_PACK_H_HAS_BEEN_INCLUDED
 
 #include "opal_config.h"
+#include "opal/datatype/opal_datatype_pack_unpack_predefined.h"
 
 #if !defined(CHECKSUM) && OPAL_CUDA_SUPPORT
 /* Make use of existing macro to do CUDA style memcpy */
@@ -107,7 +109,14 @@ pack_predefined_data( opal_convertor_t* CONVERTOR,
     /* premptively update the number of COUNT we will return. */
     *(COUNT) -= cando_count;
 
-    if( 1 == _elem->blocklen ) { /* Do as many full blocklen as possible */
+    if(_elem->blocklen < 9) {
+        if(OPAL_LIKELY(OPAL_SUCCESS == opal_datatype_pack_predefined_element(&_memory, &_packed, cando_count, _elem)))   {
+            goto update_and_return;
+        }
+        /* else the fast path declined (unhandled type, misalignment, CUDA buffer): use the memcpy path */
+    }
+
+    if(_elem->blocklen == 1) {
         for(; cando_count > 0; cando_count--) {
             OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf,
                                              (CONVERTOR)->pDesc, (CONVERTOR)->count );
diff --git a/opal/datatype/opal_datatype_pack_unpack_predefined.h b/opal/datatype/opal_datatype_pack_unpack_predefined.h
new file mode 100644
index 0000000000..c516feb511
--- /dev/null
+++ b/opal/datatype/opal_datatype_pack_unpack_predefined.h
@@ -0,0 +1,532 @@
+/*
+ * Copyright (c) 2020-2021 IBM Corporation. All rights reserved.
+ * Copyright (c) 2002      University of Chicago
+ * Copyright (c) 2001      Argonne National Laboratory
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * This file is based on MPICH code which contained the following
+ * notice in their top-level COPYRIGHT file:
+ *
+ *                    COPYRIGHT
+ *
+ * The following is a notice of limited availability of the code, and disclaimer
+ * which must be included in the prologue of the code and in all source listings
+ * of the code.
+ *
+ * Copyright Notice
+ *  + 2002 University of Chicago
+ *
+ * Permission is hereby granted to use, reproduce, prepare derivative works, and
+ * to redistribute to others.  This software was authored by:
+ *
+ * Mathematics and Computer Science Division
+ * Argonne National Laboratory, Argonne IL 60439
+ *
+ * (and)
+ *
+ * Department of Computer Science
+ * University of Illinois at Urbana-Champaign
+ *
+ *                   GOVERNMENT LICENSE
+ *
+ * Portions of this material resulted from work developed under a U.S.
+ * Government Contract and are subject to the following license: the Government
+ * is granted for itself and others acting on its behalf a paid-up, nonexclusive,
+ * irrevocable worldwide license in this computer software to reproduce, prepare
+ * derivative works, and perform publicly and display publicly.
+ *
+ *                   DISCLAIMER
+ *
+ * This computer code material was prepared, in part, as an account of work
+ * sponsored by an agency of the United States Government.  Neither the United
+ * States, nor the University of Chicago, nor any of their employees, makes any
+ * warranty express or implied, or assumes any legal liability or responsibility
+ * for the accuracy, completeness, or usefulness of any information, apparatus,
+ * product, or process disclosed, or represents that its use would not infringe
+ * privately owned rights.
+ *
+ * $HEADER$
+ */
+
+#ifndef OPAL_DATATYPE_PACK_UNPACK_PREDEFINED_H_HAS_BEEN_INCLUDED
+#define OPAL_DATATYPE_PACK_UNPACK_PREDEFINED_H_HAS_BEEN_INCLUDED
+
+#include "opal_config.h"
+#include "opal/datatype/opal_datatype_cuda.h"
+#include <stdint.h>
+
+/*  Improve predefined pack/unpack performance using mpich methods.
+ *
+ *   For the reference implementation, see:
+ *   https://github.com/pmodels/mpich/blob/9ab5fd06af2a648bf24214f0d9cff0ee77ee3e7d/src/mpi/datatype/veccpy.h
+ *
+ *   The overhead of many small memcpy() calls was slowing down the
+ *   predefined pack/unpack routines, so blocklengths of <= 8 are
+ *   copied with unrolled typed assignments instead. A manual copy
+ *   may also help for larger blocklengths, but more data has to be
+ *   gathered to tell whether it would beat the current memcpy()
+ *   implementation.
+ */
+
+/* Copy i single elements; expects i, _src and _dest at the expansion site. */
+#define OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_ONE(src_step, dest_step) { \
+  while (i != 0) {                                \
+    _dest[0] = _src[0];                           \
+    _src += src_step;  _dest += dest_step;  --i;  \
+  }                                               \
+}
+
+/* Copy whole 2-element blocks while i >= 2; uses i, _src, _dest from the caller. */
+#define OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_TWO(src_step, dest_step) { \
+  while (i >= 2) {                                \
+    _dest[0] = _src[0];  _dest[1] = _src[1];      \
+    _src += src_step;  _dest += dest_step;        \
+    i -= 2;                                       \
+  }                                               \
+}
+
+/* Copy whole 3-element blocks while i >= 3; any remainder is left in i.
+ * Uses i, _src and _dest from the expansion site. */
+#define OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_THREE(src_step, dest_step) { \
+  while (i >= 3) {                                                \
+    _dest[0] = _src[0];  _dest[1] = _src[1];  _dest[2] = _src[2]; \
+    _src += src_step;  _dest += dest_step;                        \
+    i -= 3;                                                       \
+  }                                                               \
+}
+
+/* Copy whole 4-element blocks while i >= 4; any remainder is left in i.
+ * Uses i, _src and _dest from the expansion site. */
+#define OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_FOUR(src_step, dest_step) { \
+  while (i >= 4) {                                \
+    _dest[0] = _src[0];  _dest[1] = _src[1];      \
+    _dest[2] = _src[2];  _dest[3] = _src[3];      \
+    _src += src_step;  _dest += dest_step;        \
+    i -= 4;                                       \
+  }                                               \
+}
+
+/* Copy whole 5-element blocks while i >= 5; any remainder is left in i.
+ * Uses i, _src and _dest from the expansion site. */
+#define OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_FIVE(src_step, dest_step) { \
+  while (i >= 5) {                                \
+    _dest[0] = _src[0];  _dest[1] = _src[1];      \
+    _dest[2] = _src[2];  _dest[3] = _src[3];      \
+    _dest[4] = _src[4];                           \
+    _src += src_step;  _dest += dest_step;        \
+    i -= 5;                                       \
+  }                                               \
+}
+
+/* Copy whole 6-element blocks while i >= 6; any remainder is left in i.
+ * Uses i, _src and _dest from the expansion site. */
+#define OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_SIX(src_step, dest_step) { \
+  while (i >= 6) {                                \
+    _dest[0] = _src[0];  _dest[1] = _src[1];      \
+    _dest[2] = _src[2];  _dest[3] = _src[3];      \
+    _dest[4] = _src[4];  _dest[5] = _src[5];      \
+    _src  += src_step;                            \
+    _dest += dest_step;                           \
+    i -= 6;                                       \
+  }                                               \
+}
+
+/* Copy whole 7-element blocks while i >= 7; any remainder is left in i.
+ * Uses i, _src and _dest from the expansion site. */
+#define OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_SEVEN(src_step, dest_step) { \
+  while (i >= 7) {                                \
+    _dest[0] = _src[0];  _dest[1] = _src[1];      \
+    _dest[2] = _src[2];  _dest[3] = _src[3];      \
+    _dest[4] = _src[4];  _dest[5] = _src[5];      \
+    _dest[6] = _src[6];                           \
+    _src  += src_step;                            \
+    _dest += dest_step;                           \
+    i -= 7;                                       \
+  }                                               \
+}
+
+/* Copy whole 8-element blocks while i >= 8; any remainder is left in i.
+ * After each block _src moves by src_step and _dest by dest_step.
+ * Uses i, _src and _dest from the expansion site. */
+#define OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_EIGHT(src_step, dest_step) { \
+  while (i >= 8) {                                \
+    _dest[0] = _src[0];  _dest[1] = _src[1];      \
+    _dest[2] = _src[2];  _dest[3] = _src[3];      \
+    _dest[4] = _src[4];  _dest[5] = _src[5];      \
+    _dest[6] = _src[6];  _dest[7] = _src[7];      \
+    _src  += src_step;                            \
+    _dest += dest_step;                           \
+    i -= 8;                                       \
+  }                                               \
+}
+
+/* Copy the i elements still pending one by one, advancing _src and _dest. */
+#define OPAL_DATATYPE_PACK_PREDEFINED_RESIDUAL_DATA() { \
+  while (i > 0) {                       \
+    *_dest++ = *_src++;                 \
+    --i;                                \
+  }                                     \
+}
+
+/*
+ * Pack `count` elements of `type` from the strided layout at src_base into the
+ * contiguous buffer at dest_base, reading `blocklen` elements every `stride`
+ * elements.  Only blocklen 1..8 is strided; for any other value the residual
+ * loop copies everything as one contiguous run.  src_base and dest_base must
+ * be unsigned char * lvalues and are updated to the positions reached.
+ */
+#define OPAL_DATATYPE_PACK_PREDEFINED_ELEMENT(src_base, dest_base, count, stride, blocklen, type) { \
+  type* _src  = (type *) src_base;                                        \
+  type* _dest = (type *) dest_base;                                       \
+  size_t i = count; /* same width as the size_t counts callers pass */    \
+  if(blocklen == 1) {                                                     \
+    OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_ONE(stride, blocklen);         \
+  } else if (blocklen == 2) {                                             \
+    OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_TWO(stride, blocklen);         \
+  } else if (blocklen == 3) {                                             \
+    OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_THREE(stride, blocklen);       \
+  } else if (blocklen == 4) {                                             \
+    OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_FOUR(stride, blocklen);        \
+  } else if (blocklen == 5) {                                             \
+    OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_FIVE(stride, blocklen);        \
+  } else if (blocklen == 6) {                                             \
+    OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_SIX(stride, blocklen);         \
+  } else if (blocklen == 7) {                                             \
+    OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_SEVEN(stride, blocklen);       \
+  } else if (blocklen == 8) {                                             \
+    OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_EIGHT(stride, blocklen);       \
+  }                                                                       \
+  OPAL_DATATYPE_PACK_PREDEFINED_RESIDUAL_DATA()                           \
+  src_base  = (unsigned char *) _src;                                     \
+  dest_base = (unsigned char *) _dest;                                    \
+}
+
+/*
+ * Unpack `count` elements of `type` from the contiguous buffer at src_base into
+ * the strided layout at dest_base, writing `blocklen` elements every `stride`
+ * elements.  The "PACK" block macros are reused with the meanings of blocklen
+ * and stride reversed.  Only blocklen 1..8 is strided; for any other value the
+ * residual loop copies everything as one contiguous run.  src_base and
+ * dest_base must be unsigned char * lvalues, updated to the positions reached.
+ */
+#define OPAL_DATATYPE_UNPACK_PREDEFINED_ELEMENT(src_base, dest_base, count, stride, blocklen, type) { \
+  type* _src  = (type *) src_base;                                        \
+  type* _dest = (type *) dest_base;                                       \
+  size_t i = count; /* same width as the size_t counts callers pass */    \
+  if(blocklen == 1) {                                                     \
+    OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_ONE(blocklen, stride);         \
+  } else if (blocklen == 2) {                                             \
+    OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_TWO(blocklen, stride);         \
+  } else if (blocklen == 3) {                                             \
+    OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_THREE(blocklen, stride);       \
+  } else if (blocklen == 4) {                                             \
+    OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_FOUR(blocklen, stride);        \
+  } else if (blocklen == 5) {                                             \
+    OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_FIVE(blocklen, stride);        \
+  } else if (blocklen == 6) {                                             \
+    OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_SIX(blocklen, stride);         \
+  } else if (blocklen == 7) {                                             \
+    OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_SEVEN(blocklen, stride);       \
+  } else if (blocklen == 8) {                                             \
+    OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_EIGHT(blocklen, stride);       \
+  }                                                                       \
+  OPAL_DATATYPE_PACK_PREDEFINED_RESIDUAL_DATA()                           \
+  src_base  = (unsigned char *) _src;                                     \
+  dest_base = (unsigned char *) _dest;                                    \
+}
+
+/*
+ * Fast-path unpack of one predefined-type description element: copies
+ * cando_count basic elements from the packed stream at *rtn_src into the
+ * strided layout at *rtn_dest using typed assignments instead of memcpy().
+ *
+ * Returns OPAL_SUCCESS after storing the advanced pointers through rtn_src and
+ * rtn_dest.  Returns OPAL_ERROR, leaving both untouched, when the fast path
+ * does not apply (unhandled or unavailable type, CUDA buffer, misaligned
+ * pointers, extent not a whole multiple of the element size) so that the
+ * caller can fall back to its memcpy() path.
+ */
+static inline int
+opal_datatype_unpack_predefined_element( unsigned char** rtn_src,
+                                unsigned char** rtn_dest,
+                                size_t cando_count,
+                                const ddt_elem_desc_t* elem)
+{
+    const int id = elem->common.type;
+    const size_t blocklen = elem->blocklen;
+    const size_t elem_size = opal_datatype_basicDatatypes[id]->size;
+    const int align = opal_datatype_basicDatatypes[id]->align;
+    unsigned char *src = *rtn_src;
+    unsigned char *dest = *rtn_dest;
+
+#if OPAL_CUDA_SUPPORT
+    if (opal_cuda_check_bufs(dest, src)) {
+        return OPAL_ERROR;
+    }
+/*
+ *  Checking only the first block for CUDA memory is assumed to be enough:
+ *  one description element is not expected to span both GPU and system
+ *  memory, except perhaps a two-entry element with a huge extent, which
+ *  is why the second block of such an element is checked below as well.
+ */
+    if (elem->count == 2 && cando_count >= blocklen &&
+       (opal_cuda_check_bufs(dest + elem->extent, src)))
+    {
+        return OPAL_ERROR;
+    }
+#endif
+    /*
+     * Reject a zero size/alignment first so the divisions stay defined.  The
+     * typed copy needs aligned pointers, and once a whole block is copied the
+     * stride is applied, so the extent must then be an exact multiple of the
+     * element size (the alignment may be smaller than the size).
+     */
+    if (0 == elem_size || 0 == align ||
+        (uintptr_t)src % align || (uintptr_t)dest % align ||
+        (cando_count >= blocklen &&
+         (elem->extent % align || elem->extent % (ptrdiff_t)elem_size))) {
+        return OPAL_ERROR;
+    }
+    const ptrdiff_t stride = elem->extent / (ptrdiff_t)elem_size; /* extent in elements, kept signed */
+
+/*
+ *  Each case below must expand OPAL_DATATYPE_UNPACK_PREDEFINED_ELEMENT with
+ *  the C type behind the OPAL id, e.g. int32_t for OPAL_DATATYPE_INT4; for
+ *  others like FLOAT12 the mapping is more involved.  The
+ *  OPAL_DATATYPE_HANDLE_* macros in opal_datatype_internal.h already encode
+ *  that mapping, including which base types are unavailable, so they are
+ *  reused here rather than duplicated.
+ */
+#define OPAL_DATATYPE_MYUNPACK(NAME)               \
+    do {                                           \
+        OPAL_DATATYPE_HANDLE_ ## NAME(             \
+            OPAL_DATATYPE_MYUNPACK_AVAILABLE,      \
+            OPAL_DATATYPE_MYUNPACK_NOTAVAIL, 0);   \
+    } while (0)
+
+#define OPAL_DATATYPE_MYUNPACK_AVAILABLE(TYPE, unused_ALIGN, NAME, unused) \
+    do { \
+        OPAL_DATATYPE_UNPACK_PREDEFINED_ELEMENT(src, dest, cando_count, stride, blocklen, TYPE); \
+        success = true; \
+    } while (0)
+
+#define OPAL_DATATYPE_MYUNPACK_NOTAVAIL(NAME, unused) \
+    do { \
+        success = false; \
+    } while (0)
+
+    bool success = false;
+    switch(id) {
+        case OPAL_DATATYPE_INT1:
+            // Should expand to OPAL_DATATYPE_UNPACK_PREDEFINED_ELEMENT(src, dest, cando_count,
+            // stride, blocklen, int8_t) once OPAL_DATATYPE_HANDLE_INT1 maps INT1 to int8_t
+            OPAL_DATATYPE_MYUNPACK(INT1);
+            break;
+        case OPAL_DATATYPE_INT2:
+            OPAL_DATATYPE_MYUNPACK(INT2);
+            break;
+        case OPAL_DATATYPE_INT4:
+            OPAL_DATATYPE_MYUNPACK(INT4);
+            break;
+        case OPAL_DATATYPE_INT8:
+            OPAL_DATATYPE_MYUNPACK(INT8);
+            break;
+        case OPAL_DATATYPE_INT16:
+            OPAL_DATATYPE_MYUNPACK(INT16);
+            break;
+        case OPAL_DATATYPE_UINT1:
+            OPAL_DATATYPE_MYUNPACK(UINT1);
+            break;
+        case OPAL_DATATYPE_UINT2:
+            OPAL_DATATYPE_MYUNPACK(UINT2);
+            break;
+        case OPAL_DATATYPE_UINT4:
+            OPAL_DATATYPE_MYUNPACK(UINT4);
+            break;
+        case OPAL_DATATYPE_UINT8:
+            OPAL_DATATYPE_MYUNPACK(UINT8);
+            break;
+        case OPAL_DATATYPE_UINT16:
+            OPAL_DATATYPE_MYUNPACK(UINT16);
+            break;
+        case OPAL_DATATYPE_FLOAT2:
+            OPAL_DATATYPE_MYUNPACK(FLOAT2);
+            break;
+        case OPAL_DATATYPE_FLOAT4:
+            OPAL_DATATYPE_MYUNPACK(FLOAT4);
+            break;
+        case OPAL_DATATYPE_FLOAT8:
+            OPAL_DATATYPE_MYUNPACK(FLOAT8);
+            break;
+        case OPAL_DATATYPE_FLOAT12:
+            OPAL_DATATYPE_MYUNPACK(FLOAT12);
+            break;
+        case OPAL_DATATYPE_FLOAT16:
+            OPAL_DATATYPE_MYUNPACK(FLOAT16);
+            break;
+        case OPAL_DATATYPE_SHORT_FLOAT_COMPLEX:
+            OPAL_DATATYPE_MYUNPACK(SHORT_FLOAT_COMPLEX);
+            break;
+        case OPAL_DATATYPE_FLOAT_COMPLEX:
+            OPAL_DATATYPE_MYUNPACK(FLOAT_COMPLEX);
+            break;
+        case OPAL_DATATYPE_DOUBLE_COMPLEX:
+            OPAL_DATATYPE_MYUNPACK(DOUBLE_COMPLEX);
+            break;
+        case OPAL_DATATYPE_LONG_DOUBLE_COMPLEX:
+            OPAL_DATATYPE_MYUNPACK(LONG_DOUBLE_COMPLEX);
+            break;
+        case OPAL_DATATYPE_BOOL:
+            OPAL_DATATYPE_MYUNPACK(BOOL);
+            break;
+        case OPAL_DATATYPE_WCHAR:
+            OPAL_DATATYPE_MYUNPACK(WCHAR);
+            break;
+        default:
+            return OPAL_ERROR;
+    }
+    if (!success) {
+        return OPAL_ERROR;
+    }
+
+    *rtn_src  = src;
+    *rtn_dest = dest;
+    return OPAL_SUCCESS;
+}
+
+/*
+ * Fast-path pack of one predefined-type description element: copies cando_count
+ * basic elements from the strided layout at *rtn_src into the packed stream at
+ * *rtn_dest with typed assignments.  Returns OPAL_SUCCESS after storing the
+ * advanced pointers through rtn_src/rtn_dest, or OPAL_ERROR (both untouched)
+ * when the fast path does not apply, so the caller can fall back to memcpy().
+ */
+static inline int
+opal_datatype_pack_predefined_element( unsigned char** rtn_src,
+                                unsigned char** rtn_dest,
+                                size_t cando_count,
+                                const ddt_elem_desc_t* elem)
+{
+    const int id = elem->common.type;
+    const size_t blocklen = elem->blocklen;
+    const size_t elem_size = opal_datatype_basicDatatypes[id]->size;
+    const int align = opal_datatype_basicDatatypes[id]->align;
+    unsigned char *src = *rtn_src;
+    unsigned char *dest = *rtn_dest;
+
+#if OPAL_CUDA_SUPPORT
+    if (opal_cuda_check_bufs(dest, src) ||
+        (elem->count == 2 && cando_count >= blocklen &&
+         opal_cuda_check_bufs(dest, src + elem->extent))) {
+        return OPAL_ERROR;
+    }
+#endif
+    /*
+     * Reject a zero size/alignment so the divisions stay defined.  Pointers must
+     * be aligned, and once a whole block is copied the stride is applied, so the
+     * extent must be an exact multiple of the element size (align may be less).
+     */
+    if (0 == elem_size || 0 == align ||
+        (uintptr_t)src % align || (uintptr_t)dest % align ||
+        (cando_count >= blocklen &&
+         (elem->extent % align || elem->extent % (ptrdiff_t)elem_size))) {
+        return OPAL_ERROR;
+    }
+    const ptrdiff_t stride = elem->extent / (ptrdiff_t)elem_size; /* extent in elements, kept signed */
+
+#define OPAL_DATATYPE_MYPACK(NAME)                 \
+    do {                                           \
+        OPAL_DATATYPE_HANDLE_ ## NAME(             \
+            OPAL_DATATYPE_MYPACK_AVAILABLE,        \
+            OPAL_DATATYPE_MYPACK_NOTAVAIL, 0);     \
+    } while (0)
+#define OPAL_DATATYPE_MYPACK_AVAILABLE(TYPE, unused_ALIGN, NAME, unused) \
+    do { \
+        OPAL_DATATYPE_PACK_PREDEFINED_ELEMENT(src, dest, cando_count, stride, blocklen, TYPE); \
+        success = true; \
+    } while (0)
+#define OPAL_DATATYPE_MYPACK_NOTAVAIL(NAME, unused) \
+    do { \
+        success = false; \
+    } while (0)
+
+    bool success = false;
+    switch(id) {
+        case OPAL_DATATYPE_INT1:
+            // Should expand to OPAL_DATATYPE_PACK_PREDEFINED_ELEMENT(..., int8_t) via OPAL_DATATYPE_HANDLE_INT1
+            OPAL_DATATYPE_MYPACK(INT1);
+            break;
+        case OPAL_DATATYPE_INT2:
+            OPAL_DATATYPE_MYPACK(INT2);
+            break;
+        case OPAL_DATATYPE_INT4:
+            OPAL_DATATYPE_MYPACK(INT4);
+            break;
+        case OPAL_DATATYPE_INT8:
+            OPAL_DATATYPE_MYPACK(INT8);
+            break;
+        case OPAL_DATATYPE_INT16:
+            OPAL_DATATYPE_MYPACK(INT16);
+            break;
+        case OPAL_DATATYPE_UINT1:
+            OPAL_DATATYPE_MYPACK(UINT1);
+            break;
+        case OPAL_DATATYPE_UINT2:
+            OPAL_DATATYPE_MYPACK(UINT2);
+            break;
+        case OPAL_DATATYPE_UINT4:
+            OPAL_DATATYPE_MYPACK(UINT4);
+            break;
+        case OPAL_DATATYPE_UINT8:
+            OPAL_DATATYPE_MYPACK(UINT8);
+            break;
+        case OPAL_DATATYPE_UINT16:
+            OPAL_DATATYPE_MYPACK(UINT16);
+            break;
+        case OPAL_DATATYPE_FLOAT2:
+            OPAL_DATATYPE_MYPACK(FLOAT2);
+            break;
+        case OPAL_DATATYPE_FLOAT4:
+            OPAL_DATATYPE_MYPACK(FLOAT4);
+            break;
+        case OPAL_DATATYPE_FLOAT8:
+            OPAL_DATATYPE_MYPACK(FLOAT8);
+            break;
+        case OPAL_DATATYPE_FLOAT12:
+            OPAL_DATATYPE_MYPACK(FLOAT12);
+            break;
+        case OPAL_DATATYPE_FLOAT16:
+            OPAL_DATATYPE_MYPACK(FLOAT16);
+            break;
+        case OPAL_DATATYPE_SHORT_FLOAT_COMPLEX:
+            OPAL_DATATYPE_MYPACK(SHORT_FLOAT_COMPLEX);
+            break;
+        case OPAL_DATATYPE_FLOAT_COMPLEX:
+            OPAL_DATATYPE_MYPACK(FLOAT_COMPLEX);
+            break;
+        case OPAL_DATATYPE_DOUBLE_COMPLEX:
+            OPAL_DATATYPE_MYPACK(DOUBLE_COMPLEX);
+            break;
+        case OPAL_DATATYPE_LONG_DOUBLE_COMPLEX:
+            OPAL_DATATYPE_MYPACK(LONG_DOUBLE_COMPLEX);
+            break;
+        case OPAL_DATATYPE_BOOL:
+            OPAL_DATATYPE_MYPACK(BOOL);
+            break;
+        case OPAL_DATATYPE_WCHAR:
+            OPAL_DATATYPE_MYPACK(WCHAR);
+            break;
+        default:
+            return OPAL_ERROR;
+    }
+    if (!success) {
+        return OPAL_ERROR;
+    }
+
+    *rtn_src  = src;
+    *rtn_dest = dest;
+    return OPAL_SUCCESS;
+}
+#endif
diff --git a/opal/datatype/opal_datatype_unpack.h b/opal/datatype/opal_datatype_unpack.h
index 79068729a1..a786a2fc7e 100644
--- a/opal/datatype/opal_datatype_unpack.h
+++ b/opal/datatype/opal_datatype_unpack.h
@@ -7,6 +7,7 @@
  * Copyright (c) 2011      NVIDIA Corporation.  All rights reserved.
  * Copyright (c) 2017-2018 Research Organization for Information Science
  *                         and Technology (RIST).  All rights reserved.
+ * Copyright (c) 2020-2021 IBM Corporation. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -18,6 +19,7 @@
 #define OPAL_DATATYPE_UNPACK_H_HAS_BEEN_INCLUDED
 
 #include "opal_config.h"
+#include "opal/datatype/opal_datatype_pack_unpack_predefined.h"
 
 #if !defined(CHECKSUM) && OPAL_CUDA_SUPPORT
 /* Make use of existing macro to do CUDA style memcpy */
@@ -103,6 +105,13 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR,
     /* premptively update the number of COUNT we will return. */
     *(COUNT) -= cando_count;
 
+    if( _elem->blocklen < 9 ) {
+        if(OPAL_LIKELY(OPAL_SUCCESS == opal_datatype_unpack_predefined_element(&_packed, &_memory, cando_count, _elem))) {
+            goto update_and_return;
+        }
+        /* else the fast path declined (unhandled type, misalignment, CUDA buffer): use the memcpy path */
+    }
+
     if( 1 == _elem->blocklen ) {  /* Do as many full blocklen as possible */
         for(; cando_count > 0; cando_count--) {
             OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf,
-- 
GitLab