diff --git a/LICENSE b/LICENSE
index 906630dcc68fe76ef22645d0b7e13ec5de6c72b1..4ed90b0f879ad4a32e3c3e9d53abd9f4d8afb0d4 100644
--- a/LICENSE
+++ b/LICENSE
@@ -58,6 +58,8 @@ Copyright (c) 2017-2020 Amazon.com, Inc. or its affiliates.  All Rights
 Copyright (c) 2018      DataDirect Networks. All rights reserved.
 Copyright (c) 2018-2020 Triad National Security, LLC. All rights reserved.
 Copyright (c) 2020      Google, LLC. All rights reserved.
+Copyright (c) 2002      University of Chicago
+Copyright (c) 2001      Argonne National Laboratory
 
 $COPYRIGHT$
 
@@ -99,3 +101,42 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+----------------[Copyright from inclusion of MPICH code]----------------
+
+The following is a notice of limited availability of the code, and disclaimer
+which must be included in the prologue of the code and in all source listings
+of the code.
+
+Copyright Notice
+ + 2002 University of Chicago
+
+Permission is hereby granted to use, reproduce, prepare derivative works, and
+to redistribute to others.  This software was authored by:
+
+Mathematics and Computer Science Division
+Argonne National Laboratory, Argonne IL 60439
+
+(and)
+
+Department of Computer Science
+University of Illinois at Urbana-Champaign
+
+
+			      GOVERNMENT LICENSE
+
+Portions of this material resulted from work developed under a U.S.
+Government Contract and are subject to the following license: the Government
+is granted for itself and others acting on its behalf a paid-up, nonexclusive,
+irrevocable worldwide license in this computer software to reproduce, prepare
+derivative works, and perform publicly and display publicly.
+
+				  DISCLAIMER
+
+This computer code material was prepared, in part, as an account of work
+sponsored by an agency of the United States Government.  Neither the United
+States, nor the University of Chicago, nor any of their employees, makes any
+warranty express or implied, or assumes any legal liability or responsibility
+for the accuracy, completeness, or usefulness of any information, apparatus,
+product, or process disclosed, or represents that its use would not infringe
+privately owned rights.
diff --git a/ompi/datatype/ompi_datatype.h b/ompi/datatype/ompi_datatype.h
index 9d91b88a5d464625c1afea8b930dac2fb308cf0a..e2ee2a79c0101d71f3710c90dc51ee76a0b0eaee 100644
--- a/ompi/datatype/ompi_datatype.h
+++ b/ompi/datatype/ompi_datatype.h
@@ -10,6 +10,7 @@
  * Copyright (c) 2015-2020 Research Organization for Information Science
  *                         and Technology (RIST).  All rights reserved.
  * Copyright (c) 2018      FUJITSU LIMITED.  All rights reserved.
+ * Copyright (c) 2021      IBM Corporation. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -418,5 +419,94 @@ OMPI_DECLSPEC int ompi_datatype_pack_external_size( const char datarep[], int in
         }                                                               \
     }
 
+/*
+ * Sometimes it's faster to operate on a (count,datatype) pair if it's
+ * converted to (1,larger_datatype).  This comes up in pack/unpack if
+ * the datatype is [int4b,empty4b] for example.  With that datatype the
+ * (count,datatype) path has to loop over the count processing each
+ * occurrence of the datatype, but a larger type created via
+ * MPI_Type_contiguous(count,datatype,&newtype) will have a single description
+ * entry describing the whole vector and go through pack/unpack much
+ * faster.
+ *
+ * These functions convert an incoming (count,dt) if the performance
+ * is potentially better.
+ *
+ * Note this function is only likely to be useful if the (count,datatype)
+ * describes a simple evenly spaced vector that will boil down to a
+ * single description element, but I don't think it's cheap to traverse
+ * the incoming datatype to check if that will be the case.  Eg I'm not
+ * sure it would be cheap enough to check that
+ *   [int,int,space,int,int,space]  is going to convert nicely, vs
+ *   [int,int,space,int,space]      which isn't.
+ * So the only checks performed are that the (count,datatype) isn't
+ * contiguous, and that the count is large enough to justify the
+ * overhead of making a new datatype.
+ */
+typedef struct {
+    MPI_Datatype dt;           /* datatype to hand to the convertor: the caller's or a new one */
+    MPI_Count count;           /* count that goes with dt (1 when a new type was created) */
+    int new_type_was_created;  /* nonzero if dt was made by _create and _free must destroy it */
+} ompi_datatype_consolidate_t;
+
+/* Set *dtmod to the caller's (count,dtype), or to (1, new committed contiguous
+ * type) when count >= threshold and the data isn't already contiguous.  Returns
+ * OMPI_SUCCESS or the failing call's code; on failure *dtmod owns no new type. */
+static inline int
+ompi_datatype_consolidate_create(
+    MPI_Count count, MPI_Datatype dtype, ompi_datatype_consolidate_t *dtmod,
+    int threshold)
+{
+    int rc;
+    size_t dtsize;
+    MPI_Aint lb, extent;
+    MPI_Datatype newtype;
+
+    dtmod->dt = dtype;
+    dtmod->count = count;
+    dtmod->new_type_was_created = 0;
+    if (count < threshold) { return OMPI_SUCCESS; }
+    opal_datatype_type_size ( &dtype->super, &dtsize);
+    rc = ompi_datatype_get_extent( dtype, &lb, &extent );
+    if (rc != OMPI_SUCCESS) { return rc; }
+    if ((dtype->super.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) &&
+        (MPI_Aint)dtsize == extent) { return OMPI_SUCCESS; } /* contig: no gain */
+    /* build into a local so *dtmod only changes once the type is usable */
+    rc = ompi_datatype_create_contiguous( count, dtype, &newtype );
+    if (rc != OMPI_SUCCESS) { return rc; }
+    rc = ompi_datatype_commit(&newtype);
+    if (rc != OMPI_SUCCESS) { ompi_datatype_destroy(&newtype); return rc; }
+    dtmod->dt = newtype;
+    dtmod->count = 1;
+    dtmod->new_type_was_created = 1;
+    return OMPI_SUCCESS;
+}
+/* Release the datatype made by ompi_datatype_consolidate_create(), if any. */
+static inline int
+ompi_datatype_consolidate_free(ompi_datatype_consolidate_t *dtmod)
+{
+    int status;
+    if (!dtmod->new_type_was_created) { return OMPI_SUCCESS; }
+    /* clear the flag as well so an accidental second call does nothing */
+    status = ompi_datatype_destroy( &dtmod->dt );
+    dtmod->new_type_was_created = 0;
+    return status;
+}
+/*
+ *  The magic number below just came from empirical testing on a couple
+ *  local PPC machines using [int,space] as the datatype.  There's some
+ *  overhead in constructing a new datatype, so just walking a sequence of
+ *  description elements is better for a short list of elements vs
+ *  creating a potentially shorter list and hoping the vector-walking
+ *  of the new elements is faster.  This could maybe be tuned dynamically
+ *  but it doesn't really seem worth it.
+ *
+ *  I only tested on two machines; the crossover points for pack and unpack
+ *  were 80 and 62 on one machine, and 250 and 220 on the other.  So I lean
+ *  toward using 250 for both and assuming that's likely to not waste too
+ *  much overhead on the datatype creation for most cases.
+ */
+#define OMPI_DATATYPE_CONSOLIDATE_THRESHOLD 250
+
 END_C_DECLS
 #endif  /* OMPI_DATATYPE_H_HAS_BEEN_INCLUDED */
diff --git a/ompi/mpi/c/pack.c b/ompi/mpi/c/pack.c
index 249186fbe61faae064308d55058eab55f3222fcb..6c92f53003e136325b54fc8ef0031b4e39c8adb6 100644
--- a/ompi/mpi/c/pack.c
+++ b/ompi/mpi/c/pack.c
@@ -15,6 +15,7 @@
  *                         reserved.
  * Copyright (c) 2015-2018 Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
+ * Copyright (c) 2021      IBM Corporation. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -77,10 +78,25 @@ int MPI_Pack(const void *inbuf, int incount, MPI_Datatype datatype,
 
     OPAL_CR_ENTER_LIBRARY();
 
+    /*
+     * If a datatype's description contains a single element that describes
+     * a large vector that path is reasonably optimized in pack/unpack. On
+     * the other hand if the count and datatype combined describe the same
+     * vector, that gets processed one element at a time.
+     *
+     * So at the top level we morph the call if the count and datatype look
+     * like a good vector.
+     */
+    ompi_datatype_consolidate_t dtmod;
+    rc = ompi_datatype_consolidate_create(incount, datatype, &dtmod,
+        OMPI_DATATYPE_CONSOLIDATE_THRESHOLD);
+    OMPI_ERRHANDLER_CHECK(rc, comm, rc, FUNC_NAME);
+
     OBJ_CONSTRUCT( &local_convertor, opal_convertor_t );
     /* the resulting convertor will be set to the position ZERO */
-    opal_convertor_copy_and_prepare_for_send( ompi_mpi_local_convertor, &(datatype->super),
-                                              incount, (void *) inbuf, 0, &local_convertor );
+    opal_convertor_copy_and_prepare_for_send( ompi_mpi_local_convertor,
+                                              &(dtmod.dt->super), dtmod.count,
+                                              (void *) inbuf, 0, &local_convertor );
 
     /* Check for truncation */
     opal_convertor_get_packed_size( &local_convertor, &size );
@@ -100,6 +116,9 @@ int MPI_Pack(const void *inbuf, int incount, MPI_Datatype datatype,
     *position += size;
     OBJ_DESTRUCT( &local_convertor );
 
+    rc = ompi_datatype_consolidate_free(&dtmod);
+    OMPI_ERRHANDLER_CHECK(rc, comm, rc, FUNC_NAME);
+
     OPAL_CR_EXIT_LIBRARY();
 
     /* All done.  Note that the convertor returns 1 upon success, not
diff --git a/ompi/mpi/c/unpack.c b/ompi/mpi/c/unpack.c
index eb482e7481d0c43bf0b2c64d61fa5cd4ffb9248e..ff61ed38d4ceefe2b8cad62301f7013e024b5fb9 100644
--- a/ompi/mpi/c/unpack.c
+++ b/ompi/mpi/c/unpack.c
@@ -12,6 +12,7 @@
  * Copyright (c) 2006-2013 Cisco Systems, Inc.  All rights reserved.
  * Copyright (c) 2015-2018 Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
+ * Copyright (c) 2021      IBM Corporation. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -79,12 +80,27 @@ int MPI_Unpack(const void *inbuf, int insize, int *position,
 
     OPAL_CR_ENTER_LIBRARY();
 
+    /*
+     * If a datatype's description contains a single element that describes
+     * a large vector that path is reasonably optimized in pack/unpack. On
+     * the other hand if the count and datatype combined describe the same
+     * vector, that gets processed one element at a time.
+     *
+     * So at the top level we morph the call if the count and datatype look
+     * like a good vector.
+     */
+    ompi_datatype_consolidate_t dtmod;
+    rc = ompi_datatype_consolidate_create(outcount, datatype, &dtmod,
+        OMPI_DATATYPE_CONSOLIDATE_THRESHOLD);
+    OMPI_ERRHANDLER_CHECK(rc, comm, rc, FUNC_NAME);
+
     if( insize > 0 ) {
         int ret;
         OBJ_CONSTRUCT( &local_convertor, opal_convertor_t );
         /* the resulting convertor will be set the the position ZERO */
-        opal_convertor_copy_and_prepare_for_recv( ompi_mpi_local_convertor, &(datatype->super),
-                                                  outcount, outbuf, 0, &local_convertor );
+        opal_convertor_copy_and_prepare_for_recv( ompi_mpi_local_convertor,
+                                                  &(dtmod.dt->super), dtmod.count,
+                                                  outbuf, 0, &local_convertor );
 
         /* Check for truncation */
         opal_convertor_get_packed_size( &local_convertor, &size );
@@ -110,6 +126,9 @@ int MPI_Unpack(const void *inbuf, int insize, int *position,
         }
     }
 
+    rc = ompi_datatype_consolidate_free(&dtmod);
+    OMPI_ERRHANDLER_CHECK(rc, comm, rc, FUNC_NAME);
+
     OPAL_CR_EXIT_LIBRARY();
 
     OMPI_ERRHANDLER_RETURN(rc, comm, MPI_ERR_UNKNOWN, FUNC_NAME);
diff --git a/opal/datatype/Makefile.am b/opal/datatype/Makefile.am
index daaaa8e4b0781e65c142ab3d613f84ecef666669..36d13eff3b56dfe55e69accf01480d6eb19eec86 100644
--- a/opal/datatype/Makefile.am
+++ b/opal/datatype/Makefile.am
@@ -17,6 +17,7 @@
 # Copyright (c) 2011-2013 NVIDIA Corporation.  All rights reserved.
 # Copyright (c) 2018      Research Organization for Information Science
 #                         and Technology (RIST). All rights reserved.
+# Copyright (c) 2021      IBM Corporation. All rights reserved.
 # $COPYRIGHT$
 #
 # Additional copyrights may follow
@@ -32,6 +33,7 @@ headers = \
         opal_datatype_internal.h \
         opal_datatype_copy.h \
         opal_datatype_memcpy.h \
+        opal_datatype_pack_unpack_predefined.h \
         opal_datatype_pack.h \
         opal_datatype_prototypes.h \
         opal_datatype_unpack.h
diff --git a/opal/datatype/opal_datatype_internal.h b/opal/datatype/opal_datatype_internal.h
index 04915cddcbfa3c44888410664328e58ea85dc922..b14acdf61683401d2005dce880250697d0448f49 100644
--- a/opal/datatype/opal_datatype_internal.h
+++ b/opal/datatype/opal_datatype_internal.h
@@ -17,6 +17,7 @@
  * Copyright (c) 2017      Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
  * Copyright (c) 2018      FUJITSU LIMITED.  All rights reserved.
+ * Copyright (c) 2021      IBM Corporation. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -334,119 +335,205 @@ struct opal_datatype_t;
         .ptypes = OPAL_DATATYPE_INIT_PTYPES_ARRAY_UNAVAILABLE                        \
     }
 
+#define OPAL_DATATYPE_INITIALIZER_INT1(FLAGS)                  \
+             OPAL_DATATYPE_HANDLE_INT1(                        \
+             OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_INT2(FLAGS)                  \
+             OPAL_DATATYPE_HANDLE_INT2(                        \
+             OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_INT4(FLAGS)                  \
+             OPAL_DATATYPE_HANDLE_INT4(                        \
+             OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_INT8(FLAGS)                  \
+             OPAL_DATATYPE_HANDLE_INT8(                        \
+             OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_INT16(FLAGS)                 \
+             OPAL_DATATYPE_HANDLE_INT16(                       \
+             OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_UINT1(FLAGS)                 \
+             OPAL_DATATYPE_HANDLE_UINT1(                       \
+             OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_UINT2(FLAGS)                 \
+             OPAL_DATATYPE_HANDLE_UINT2(                       \
+             OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_UINT4(FLAGS)                 \
+             OPAL_DATATYPE_HANDLE_UINT4(                       \
+             OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_UINT8(FLAGS)                 \
+             OPAL_DATATYPE_HANDLE_UINT8(                       \
+             OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_UINT16(FLAGS)                \
+             OPAL_DATATYPE_HANDLE_UINT16(                      \
+             OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS)                \
+             OPAL_DATATYPE_HANDLE_FLOAT2(                      \
+             OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS)                \
+             OPAL_DATATYPE_HANDLE_FLOAT4(                      \
+             OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS)                \
+             OPAL_DATATYPE_HANDLE_FLOAT8(                      \
+             OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS)               \
+             OPAL_DATATYPE_HANDLE_FLOAT12(                     \
+             OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS)               \
+             OPAL_DATATYPE_HANDLE_FLOAT16(                     \
+             OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_SHORT_FLOAT_COMPLEX(FLAGS)   \
+             OPAL_DATATYPE_HANDLE_SHORT_FLOAT_COMPLEX(         \
+             OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_FLOAT_COMPLEX(FLAGS)         \
+             OPAL_DATATYPE_HANDLE_FLOAT_COMPLEX(               \
+             OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_DOUBLE_COMPLEX(FLAGS)        \
+             OPAL_DATATYPE_HANDLE_DOUBLE_COMPLEX(              \
+             OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_LONG_DOUBLE_COMPLEX(FLAGS)   \
+             OPAL_DATATYPE_HANDLE_LONG_DOUBLE_COMPLEX(         \
+             OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_BOOL(FLAGS)                  \
+             OPAL_DATATYPE_HANDLE_BOOL(                        \
+             OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
+#define OPAL_DATATYPE_INITIALIZER_WCHAR(FLAGS)                 \
+             OPAL_DATATYPE_HANDLE_WCHAR(                       \
+             OPAL_DATATYPE_INIT_BASIC_DATATYPE,                \
+             OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED, FLAGS)
 #define OPAL_DATATYPE_INITIALIZER_LOOP(FLAGS)       OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_LOOP, LOOP_S, FLAGS )
 #define OPAL_DATATYPE_INITIALIZER_END_LOOP(FLAGS)   OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_END_LOOP, LOOP_E, FLAGS )
 #define OPAL_DATATYPE_INITIALIZER_LB(FLAGS)         OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_LB, LB, FLAGS )
 #define OPAL_DATATYPE_INITIALIZER_UB(FLAGS)         OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_UB, UB, FLAGS )
-#define OPAL_DATATYPE_INITIALIZER_INT1(FLAGS)       OPAL_DATATYPE_INIT_BASIC_DATATYPE( int8_t, OPAL_ALIGNMENT_INT8, INT1, FLAGS )
-#define OPAL_DATATYPE_INITIALIZER_INT2(FLAGS)       OPAL_DATATYPE_INIT_BASIC_DATATYPE( int16_t, OPAL_ALIGNMENT_INT16, INT2, FLAGS )
-#define OPAL_DATATYPE_INITIALIZER_INT4(FLAGS)       OPAL_DATATYPE_INIT_BASIC_DATATYPE( int32_t, OPAL_ALIGNMENT_INT32, INT4, FLAGS )
-#define OPAL_DATATYPE_INITIALIZER_INT8(FLAGS)       OPAL_DATATYPE_INIT_BASIC_DATATYPE( int64_t, OPAL_ALIGNMENT_INT64, INT8, FLAGS )
+
+
+#define OPAL_DATATYPE_HANDLE_INT1(AV, NOTAV, FLAGS)       AV( int8_t, OPAL_ALIGNMENT_INT8, INT1, FLAGS )
+#define OPAL_DATATYPE_HANDLE_INT2(AV, NOTAV, FLAGS)       AV( int16_t, OPAL_ALIGNMENT_INT16, INT2, FLAGS )
+#define OPAL_DATATYPE_HANDLE_INT4(AV, NOTAV, FLAGS)       AV( int32_t, OPAL_ALIGNMENT_INT32, INT4, FLAGS )
+#define OPAL_DATATYPE_HANDLE_INT8(AV, NOTAV, FLAGS)       AV( int64_t, OPAL_ALIGNMENT_INT64, INT8, FLAGS )
 #ifdef HAVE_INT128_T
-#define OPAL_DATATYPE_INITIALIZER_INT16(FLAGS)      OPAL_DATATYPE_INIT_BASIC_DATATYPE( int128_t, OPAL_ALIGNMENT_INT128, INT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_INT16(AV, NOTAV, FLAGS)      AV( int128_t, OPAL_ALIGNMENT_INT128, INT16, FLAGS )
 #else
-#define OPAL_DATATYPE_INITIALIZER_INT16(FLAGS)      OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( INT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_INT16(AV, NOTAV, FLAGS)      NOTAV( INT16, FLAGS )
 #endif
-#define OPAL_DATATYPE_INITIALIZER_UINT1(FLAGS)      OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint8_t, OPAL_ALIGNMENT_INT8, UINT1, FLAGS )
-#define OPAL_DATATYPE_INITIALIZER_UINT2(FLAGS)      OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint16_t, OPAL_ALIGNMENT_INT16, UINT2, FLAGS )
-#define OPAL_DATATYPE_INITIALIZER_UINT4(FLAGS)      OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint32_t, OPAL_ALIGNMENT_INT32, UINT4, FLAGS )
-#define OPAL_DATATYPE_INITIALIZER_UINT8(FLAGS)      OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint64_t, OPAL_ALIGNMENT_INT64, UINT8, FLAGS )
+#define OPAL_DATATYPE_HANDLE_UINT1(AV, NOTAV, FLAGS)      AV( uint8_t, OPAL_ALIGNMENT_INT8, UINT1, FLAGS )
+#define OPAL_DATATYPE_HANDLE_UINT2(AV, NOTAV, FLAGS)      AV( uint16_t, OPAL_ALIGNMENT_INT16, UINT2, FLAGS )
+#define OPAL_DATATYPE_HANDLE_UINT4(AV, NOTAV, FLAGS)      AV( uint32_t, OPAL_ALIGNMENT_INT32, UINT4, FLAGS )
+#define OPAL_DATATYPE_HANDLE_UINT8(AV, NOTAV, FLAGS)      AV( uint64_t, OPAL_ALIGNMENT_INT64, UINT8, FLAGS )
 #ifdef HAVE_UINT128_T
-#define OPAL_DATATYPE_INITIALIZER_UINT16(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint128_t, OPAL_ALIGNMENT_INT128, UINT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_UINT16(AV, NOTAV, FLAGS)     AV( uint128_t, OPAL_ALIGNMENT_INT128, UINT16, FLAGS )
 #else
-#define OPAL_DATATYPE_INITIALIZER_UINT16(FLAGS)     OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( INT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_UINT16(AV, NOTAV, FLAGS)     NOTAV( UINT16, FLAGS ) /* own name, like the other fallbacks */
 #endif
 
 #if defined(HAVE_SHORT_FLOAT) && SIZEOF_SHORT_FLOAT == 2
-#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT2, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT2(AV, NOTAV, FLAGS)     AV( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT2, FLAGS )
 #elif SIZEOF_FLOAT == 2
-#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT2, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT2(AV, NOTAV, FLAGS)     AV( float, OPAL_ALIGNMENT_FLOAT, FLOAT2, FLAGS )
 #elif SIZEOF_DOUBLE == 2
-#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT2, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT2(AV, NOTAV, FLAGS)     AV( double, OPAL_ALIGNMENT_DOUBLE, FLOAT2, FLAGS )
 #elif SIZEOF_LONG_DOUBLE == 2
-#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT2, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT2(AV, NOTAV, FLAGS)     AV( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT2, FLAGS )
 #elif defined(HAVE_OPAL_SHORT_FLOAT_T) && SIZEOF_OPAL_SHORT_FLOAT_T == 2
-#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT2, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT2(AV, NOTAV, FLAGS)     AV( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT2, FLAGS )
 #else
-#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS)     OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT2, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT2(AV, NOTAV, FLAGS)     NOTAV( FLOAT2, FLAGS )
 #endif
 
 #if defined(HAVE_SHORT_FLOAT) && SIZEOF_SHORT_FLOAT == 4
-#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT4, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT4(AV, NOTAV, FLAGS)     AV( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT4, FLAGS )
 #elif SIZEOF_FLOAT == 4
-#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT4, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT4(AV, NOTAV, FLAGS)     AV( float, OPAL_ALIGNMENT_FLOAT, FLOAT4, FLAGS )
 #elif SIZEOF_DOUBLE == 4
-#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT4, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT4(AV, NOTAV, FLAGS)     AV( double, OPAL_ALIGNMENT_DOUBLE, FLOAT4, FLAGS )
 #elif SIZEOF_LONG_DOUBLE == 4
-#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT4, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT4(AV, NOTAV, FLAGS)     AV( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT4, FLAGS )
 #elif defined(HAVE_OPAL_SHORT_FLOAT_T) && SIZEOF_OPAL_SHORT_FLOAT_T == 4
-#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT4, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT4(AV, NOTAV, FLAGS)     AV( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT4, FLAGS )
 #else
-#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS)     OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT4, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT4(AV, NOTAV, FLAGS)     NOTAV( FLOAT4, FLAGS )
 #endif
 
 #if defined(HAVE_SHORT_FLOAT) && SIZEOF_SHORT_FLOAT == 8
-#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT8, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT8(AV, NOTAV, FLAGS)     AV( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT8, FLAGS )
 #elif SIZEOF_FLOAT == 8
-#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT8, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT8(AV, NOTAV, FLAGS)     AV( float, OPAL_ALIGNMENT_FLOAT, FLOAT8, FLAGS )
 #elif SIZEOF_DOUBLE == 8
-#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT8, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT8(AV, NOTAV, FLAGS)     AV( double, OPAL_ALIGNMENT_DOUBLE, FLOAT8, FLAGS )
 #elif SIZEOF_LONG_DOUBLE == 8
-#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT8, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT8(AV, NOTAV, FLAGS)     AV( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT8, FLAGS )
 #elif defined(HAVE_OPAL_SHORT_FLOAT_T) && SIZEOF_OPAL_SHORT_FLOAT_T == 8
-#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS)     OPAL_DATATYPE_INIT_BASIC_DATATYPE( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT8, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT8(AV, NOTAV, FLAGS)     AV( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT8, FLAGS )
 #else
-#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS)     OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT8, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT8(AV, NOTAV, FLAGS)     NOTAV( FLOAT8, FLAGS )
 #endif
 
 #if defined(HAVE_SHORT_FLOAT) && SIZEOF_SHORT_FLOAT == 12
-#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS)    OPAL_DATATYPE_INIT_BASIC_DATATYPE( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT12, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT12(AV, NOTAV, FLAGS)    AV( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT12, FLAGS )
 #elif SIZEOF_FLOAT == 12
-#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS)    OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT12, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT12(AV, NOTAV, FLAGS)    AV( float, OPAL_ALIGNMENT_FLOAT, FLOAT12, FLAGS )
 #elif SIZEOF_DOUBLE == 12
-#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS)    OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT12, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT12(AV, NOTAV, FLAGS)    AV( double, OPAL_ALIGNMENT_DOUBLE, FLOAT12, FLAGS )
 #elif SIZEOF_LONG_DOUBLE == 12
-#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS)    OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT12, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT12(AV, NOTAV, FLAGS)    AV( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT12, FLAGS )
 #elif defined(HAVE_OPAL_SHORT_FLOAT_T) && SIZEOF_OPAL_SHORT_FLOAT_T == 12
-#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS)    OPAL_DATATYPE_INIT_BASIC_DATATYPE( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT12, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT12(AV, NOTAV, FLAGS)    AV( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT12, FLAGS )
 #else
-#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS)    OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT12, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT12(AV, NOTAV, FLAGS)    NOTAV( FLOAT12, FLAGS )
 #endif
 
 #if defined(HAVE_SHORT_FLOAT) && SIZEOF_SHORT_FLOAT == 16
-#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS)    OPAL_DATATYPE_INIT_BASIC_DATATYPE( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT16(AV, NOTAV, FLAGS)    AV( short float, OPAL_ALIGNMENT_SHORT_FLOAT, FLOAT16, FLAGS )
 #elif SIZEOF_FLOAT == 16
-#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS)    OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT16(AV, NOTAV, FLAGS)    AV( float, OPAL_ALIGNMENT_FLOAT, FLOAT16, FLAGS )
 #elif SIZEOF_DOUBLE == 16
-#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS)    OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT16(AV, NOTAV, FLAGS)    AV( double, OPAL_ALIGNMENT_DOUBLE, FLOAT16, FLAGS )
 #elif SIZEOF_LONG_DOUBLE == 16
-#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS)    OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT16(AV, NOTAV, FLAGS)    AV( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT16, FLAGS )
 #elif defined(HAVE_OPAL_SHORT_FLOAT_T) && SIZEOF_OPAL_SHORT_FLOAT_T == 16
-#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS)    OPAL_DATATYPE_INIT_BASIC_DATATYPE( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT16(AV, NOTAV, FLAGS)    AV( opal_short_float_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, FLOAT16, FLAGS )
 #else
-#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS)    OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT16, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT16(AV, NOTAV, FLAGS)    NOTAV( FLOAT16, FLAGS )
 #endif
 
 #if defined(HAVE_SHORT_FLOAT__COMPLEX)
-#define OPAL_DATATYPE_INITIALIZER_SHORT_FLOAT_COMPLEX(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( short float _Complex, OPAL_ALIGNMENT_SHORT_FLOAT_COMPLEX, SHORT_FLOAT_COMPLEX, FLAGS )
+#define OPAL_DATATYPE_HANDLE_SHORT_FLOAT_COMPLEX(AV, NOTAV, FLAGS) AV( short float _Complex, OPAL_ALIGNMENT_SHORT_FLOAT_COMPLEX, SHORT_FLOAT_COMPLEX, FLAGS )
 #elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T)
-#define OPAL_DATATYPE_INITIALIZER_SHORT_FLOAT_COMPLEX(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( opal_short_float_complex_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, SHORT_FLOAT_COMPLEX, FLAGS )
+#define OPAL_DATATYPE_HANDLE_SHORT_FLOAT_COMPLEX(AV, NOTAV, FLAGS) AV( opal_short_float_complex_t, OPAL_ALIGNMENT_OPAL_SHORT_FLOAT_T, SHORT_FLOAT_COMPLEX, FLAGS )
 #else
-#define OPAL_DATATYPE_INITIALIZER_SHORT_FLOAT_COMPLEX(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( SHORT_FLOAT_COMPLEX, FLAGS)
+#define OPAL_DATATYPE_HANDLE_SHORT_FLOAT_COMPLEX(AV, NOTAV, FLAGS) NOTAV( SHORT_FLOAT_COMPLEX, FLAGS)
 #endif
 
-#define OPAL_DATATYPE_INITIALIZER_FLOAT_COMPLEX(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( float _Complex, OPAL_ALIGNMENT_FLOAT_COMPLEX, FLOAT_COMPLEX, FLAGS )
+#define OPAL_DATATYPE_HANDLE_FLOAT_COMPLEX(AV, NOTAV, FLAGS) AV( float _Complex, OPAL_ALIGNMENT_FLOAT_COMPLEX, FLOAT_COMPLEX, FLAGS )
 
-#define OPAL_DATATYPE_INITIALIZER_DOUBLE_COMPLEX(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( double _Complex, OPAL_ALIGNMENT_DOUBLE_COMPLEX, DOUBLE_COMPLEX, FLAGS )
+#define OPAL_DATATYPE_HANDLE_DOUBLE_COMPLEX(AV, NOTAV, FLAGS) AV( double _Complex, OPAL_ALIGNMENT_DOUBLE_COMPLEX, DOUBLE_COMPLEX, FLAGS )
 
-#define OPAL_DATATYPE_INITIALIZER_LONG_DOUBLE_COMPLEX(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double _Complex, OPAL_ALIGNMENT_LONG_DOUBLE_COMPLEX, LONG_DOUBLE_COMPLEX, FLAGS )
+#define OPAL_DATATYPE_HANDLE_LONG_DOUBLE_COMPLEX(AV, NOTAV, FLAGS) AV( long double _Complex, OPAL_ALIGNMENT_LONG_DOUBLE_COMPLEX, LONG_DOUBLE_COMPLEX, FLAGS )
 
-#define OPAL_DATATYPE_INITIALIZER_BOOL(FLAGS)       OPAL_DATATYPE_INIT_BASIC_DATATYPE( _Bool, OPAL_ALIGNMENT_BOOL, BOOL, FLAGS )
+#define OPAL_DATATYPE_HANDLE_BOOL(AV, NOTAV, FLAGS)       AV( _Bool, OPAL_ALIGNMENT_BOOL, BOOL, FLAGS )
 
 #if OPAL_ALIGNMENT_WCHAR != 0
-#define OPAL_DATATYPE_INITIALIZER_WCHAR(FLAGS)      OPAL_DATATYPE_INIT_BASIC_DATATYPE( wchar_t, OPAL_ALIGNMENT_WCHAR, WCHAR, FLAGS )
+#define OPAL_DATATYPE_HANDLE_WCHAR(AV, NOTAV, FLAGS)      AV( wchar_t, OPAL_ALIGNMENT_WCHAR, WCHAR, FLAGS )
 #else
-#define OPAL_DATATYPE_INITIALIZER_WCHAR(FLAGS)      OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( WCHAR, FLAGS )
+#define OPAL_DATATYPE_HANDLE_WCHAR(AV, NOTAV, FLAGS)      NOTAV( WCHAR, FLAGS )
 #endif
 
 #define BASIC_DDT_FROM_ELEM( ELEM ) (opal_datatype_basicDatatypes[(ELEM).elem.common.type])
diff --git a/opal/datatype/opal_datatype_pack.h b/opal/datatype/opal_datatype_pack.h
index d200f911fddc57ec4d1a3c6f1f81f9aa4996e978..2031a005e70592bcc77219eb1c64a2b951492c35 100644
--- a/opal/datatype/opal_datatype_pack.h
+++ b/opal/datatype/opal_datatype_pack.h
@@ -8,6 +8,7 @@
  * Copyright (c) 2017-2018 Research Organization for Information Science
  *                         and Technology (RIST).  All rights reserved.
  * Copyright (c) 2020      Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2020-2021 IBM Corporation.  All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -19,6 +20,7 @@
 #define OPAL_DATATYPE_PACK_H_HAS_BEEN_INCLUDED
 
 #include "opal_config.h"
+#include "opal/datatype/opal_datatype_pack_unpack_predefined.h"
 
 #if !defined(CHECKSUM) && OPAL_CUDA_SUPPORT
 /* Make use of existing macro to do CUDA style memcpy */
@@ -107,7 +109,14 @@ pack_predefined_data( opal_convertor_t* CONVERTOR,
     /* premptively update the number of COUNT we will return. */
     *(COUNT) -= cando_count;
 
-    if( 1 == _elem->blocklen ) { /* Do as many full blocklen as possible */
+    if(_elem->blocklen < 9) {
+        if(OPAL_LIKELY(OPAL_SUCCESS == opal_datatype_pack_predefined_element(&_memory, &_packed, cando_count, _elem)))   {
+            goto update_and_return;
+        }
+        /* else unrecognized _elem->common.type, use the memcpy path */
+    }
+
+    if(_elem->blocklen == 1) {
         for(; cando_count > 0; cando_count--) {
             OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf,
                                              (CONVERTOR)->pDesc, (CONVERTOR)->count );
diff --git a/opal/datatype/opal_datatype_pack_unpack_predefined.h b/opal/datatype/opal_datatype_pack_unpack_predefined.h
new file mode 100644
index 0000000000000000000000000000000000000000..c516feb511df6b74707ebcb020274b7dc2190a19
--- /dev/null
+++ b/opal/datatype/opal_datatype_pack_unpack_predefined.h
@@ -0,0 +1,532 @@
+/*
+ * Copyright (c) 2020-2021 IBM Corporation. All rights reserved.
+ * Copyright (c) 2002      University of Chicago
+ * Copyright (c) 2001      Argonne National Laboratory
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * This file is based on MPICH code which contained the following
+ * notice in their top-level COPYRIGHT file:
+ *
+ *                    COPYRIGHT
+ *
+ * The following is a notice of limited availability of the code, and disclaimer
+ * which must be included in the prologue of the code and in all source listings
+ * of the code.
+ *
+ * Copyright Notice
+ *  + 2002 University of Chicago
+ *
+ * Permission is hereby granted to use, reproduce, prepare derivative works, and
+ * to redistribute to others.  This software was authored by:
+ *
+ * Mathematics and Computer Science Division
+ * Argonne National Laboratory, Argonne IL 60439
+ *
+ * (and)
+ *
+ * Department of Computer Science
+ * University of Illinois at Urbana-Champaign
+ *
+ *                   GOVERNMENT LICENSE
+ *
+ * Portions of this material resulted from work developed under a U.S.
+ * Government Contract and are subject to the following license: the Government
+ * is granted for itself and others acting on its behalf a paid-up, nonexclusive,
+ * irrevocable worldwide license in this computer software to reproduce, prepare
+ * derivative works, and perform publicly and display publicly.
+ *
+ *                   DISCLAIMER
+ *
+ * This computer code material was prepared, in part, as an account of work
+ * sponsored by an agency of the United States Government.  Neither the United
+ * States, nor the University of Chicago, nor any of their employees, makes any
+ * warranty express or implied, or assumes any legal liability or responsibility
+ * for the accuracy, completeness, or usefulness of any information, apparatus,
+ * product, or process disclosed, or represents that its use would not infringe
+ * privately owned rights.
+ *
+ * $HEADER$
+ */
+
+#ifndef OPAL_DATATYPE_PACK_UNPACK_PREDEFINED_H_HAS_BEEN_INCLUDED
+#define OPAL_DATATYPE_PACK_UNPACK_PREDEFINED_H_HAS_BEEN_INCLUDED
+
+#include "opal_config.h"
+#include "opal/datatype/opal_datatype_cuda.h"
+#include <stdint.h>
+
+/*  Improve predefined pack/unpack performance using mpich methods.
+ *
+ *   For reference implementation, see:
+ *   https://github.com/pmodels/mpich/blob/9ab5fd06af2a648bf24214f0d9cff0ee77ee3e7d/src/mpi/datatype/veccpy.h
+ *
+ *   The overhead of memcpy() was slowing down the predefined
+ *   pack/unpack routines, so blocks of up to 8 elements are copied
+ *   element by element instead. A manual copy may also help for
+ *   larger blocklengths, but more measurements are needed to tell
+ *   whether it would actually beat the memcpy()-based path those
+ *   blocklengths use today.
+ */
+
+/*
+ * Unrolled copy loops for blocks of one to eight elements, shared by the
+ * pack and unpack element macros below.
+ *
+ * Each loop works on three variables of the scope it is expanded in:
+ *   i      number of elements still to copy
+ *   _src   typed read cursor
+ *   _dest  typed write cursor
+ * While a whole block is left it copies the block, then moves _src by the
+ * first argument and _dest by the second (both in elements) and takes the
+ * block size off i.  Fewer than a full block is left untouched in i.
+ *
+ * The argument names describe packing (strided source, contiguous
+ * destination); unpacking passes the same two values the other way round.
+ */
+
+#define OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_ONE(stride, blocklen) {    \
+  for (; i != 0; i--, _src += (stride), _dest += (blocklen)) {            \
+    _dest[0] = _src[0];                                                   \
+  }                                                                       \
+}
+
+#define OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_TWO(stride, blocklen) {    \
+  for (; i >= 2; i -= 2, _src += (stride), _dest += (blocklen)) {         \
+    _dest[0] = _src[0];                                                   \
+    _dest[1] = _src[1];                                                   \
+  }                                                                       \
+}
+
+#define OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_THREE(stride, blocklen) {  \
+  for (; i >= 3; i -= 3, _src += (stride), _dest += (blocklen)) {         \
+    _dest[0] = _src[0];                                                   \
+    _dest[1] = _src[1];                                                   \
+    _dest[2] = _src[2];                                                   \
+  }                                                                       \
+}
+
+#define OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_FOUR(stride, blocklen) {   \
+  for (; i >= 4; i -= 4, _src += (stride), _dest += (blocklen)) {         \
+    _dest[0] = _src[0];                                                   \
+    _dest[1] = _src[1];                                                   \
+    _dest[2] = _src[2];                                                   \
+    _dest[3] = _src[3];                                                   \
+  }                                                                       \
+}
+
+#define OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_FIVE(stride, blocklen) {   \
+  for (; i >= 5; i -= 5, _src += (stride), _dest += (blocklen)) {         \
+    _dest[0] = _src[0];                                                   \
+    _dest[1] = _src[1];                                                   \
+    _dest[2] = _src[2];                                                   \
+    _dest[3] = _src[3];                                                   \
+    _dest[4] = _src[4];                                                   \
+  }                                                                       \
+}
+
+#define OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_SIX(stride, blocklen) {    \
+  for (; i >= 6; i -= 6, _src += (stride), _dest += (blocklen)) {         \
+    _dest[0] = _src[0];                                                   \
+    _dest[1] = _src[1];                                                   \
+    _dest[2] = _src[2];                                                   \
+    _dest[3] = _src[3];                                                   \
+    _dest[4] = _src[4];                                                   \
+    _dest[5] = _src[5];                                                   \
+  }                                                                       \
+}
+
+#define OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_SEVEN(stride, blocklen) {  \
+  for (; i >= 7; i -= 7, _src += (stride), _dest += (blocklen)) {         \
+    _dest[0] = _src[0];                                                   \
+    _dest[1] = _src[1];                                                   \
+    _dest[2] = _src[2];                                                   \
+    _dest[3] = _src[3];                                                   \
+    _dest[4] = _src[4];                                                   \
+    _dest[5] = _src[5];                                                   \
+    _dest[6] = _src[6];                                                   \
+  }                                                                       \
+}
+
+#define OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_EIGHT(stride, blocklen) {  \
+  for (; i >= 8; i -= 8, _src += (stride), _dest += (blocklen)) {         \
+    _dest[0] = _src[0];                                                   \
+    _dest[1] = _src[1];                                                   \
+    _dest[2] = _src[2];                                                   \
+    _dest[3] = _src[3];                                                   \
+    _dest[4] = _src[4];                                                   \
+    _dest[5] = _src[5];                                                   \
+    _dest[6] = _src[6];                                                   \
+    _dest[7] = _src[7];                                                   \
+  }                                                                       \
+}
+
+/* Copy the i elements that did not fill a whole block, contiguously on both
+ * sides; i ends at 0 and _src / _dest end one past the last element copied. */
+#define OPAL_DATATYPE_PACK_PREDEFINED_RESIDUAL_DATA() { \
+  for (; i > 0; --i) {                  \
+    *_dest++ = *_src++;                 \
+  }                                     \
+}
+
+/*
+ * Copy `count` elements of `type` from the strided buffer at src_base into
+ * the contiguous buffer at dest_base.
+ *
+ * stride and blocklen are counted in elements of `type`.  Whole blocks of 1
+ * to 8 elements go through the unrolled OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_*
+ * loops; whatever is left afterwards (a trailing partial block, or all of
+ * the data for any other blocklen) is copied contiguously.
+ *
+ * src_base and dest_base must be lvalues that accept an unsigned char*; on
+ * exit they hold the positions at which the copy stopped.
+ */
+#define OPAL_DATATYPE_PACK_PREDEFINED_ELEMENT(src_base, dest_base, count, stride, blocklen, type) { \
+  type* _src  = (type *) (src_base);                                      \
+  type* _dest = (type *) (dest_base);                                     \
+  /* size_t, like the caller's count: unsigned long can be narrower */    \
+  size_t i = (count);                                                     \
+  switch (blocklen) {                                                     \
+    case 1: OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_ONE(stride, blocklen);   break; \
+    case 2: OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_TWO(stride, blocklen);   break; \
+    case 3: OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_THREE(stride, blocklen); break; \
+    case 4: OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_FOUR(stride, blocklen);  break; \
+    case 5: OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_FIVE(stride, blocklen);  break; \
+    case 6: OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_SIX(stride, blocklen);   break; \
+    case 7: OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_SEVEN(stride, blocklen); break; \
+    case 8: OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_EIGHT(stride, blocklen); break; \
+    default: break; /* no unrolled loop: the residual copy takes it all */ \
+  }                                                                       \
+  OPAL_DATATYPE_PACK_PREDEFINED_RESIDUAL_DATA()                           \
+  src_base  = (unsigned char *) _src;                                     \
+  dest_base = (unsigned char *) _dest;                                    \
+}
+
+/*
+ * Copy `count` elements of `type` from the contiguous buffer at src_base
+ * into the strided buffer at dest_base.
+ *
+ * This is the pack copy with the roles of stride and blocklen reversed, so
+ * the OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_* loops are reused with their
+ * two arguments swapped.  stride and blocklen are counted in elements of
+ * `type`; data that does not fill a whole block of 1 to 8 elements is copied
+ * contiguously.
+ *
+ * src_base and dest_base must be lvalues that accept an unsigned char*; on
+ * exit they hold the positions at which the copy stopped.
+ */
+#define OPAL_DATATYPE_UNPACK_PREDEFINED_ELEMENT(src_base, dest_base, count, stride, blocklen, type) { \
+  type* _src  = (type *) (src_base);                                      \
+  type* _dest = (type *) (dest_base);                                     \
+  /* size_t, like the caller's count: unsigned long can be narrower */    \
+  size_t i = (count);                                                     \
+  switch (blocklen) {                                                     \
+    case 1: OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_ONE(blocklen, stride);   break; \
+    case 2: OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_TWO(blocklen, stride);   break; \
+    case 3: OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_THREE(blocklen, stride); break; \
+    case 4: OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_FOUR(blocklen, stride);  break; \
+    case 5: OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_FIVE(blocklen, stride);  break; \
+    case 6: OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_SIX(blocklen, stride);   break; \
+    case 7: OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_SEVEN(blocklen, stride); break; \
+    case 8: OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_EIGHT(blocklen, stride); break; \
+    default: break; /* no unrolled loop: the residual copy takes it all */ \
+  }                                                                       \
+  OPAL_DATATYPE_PACK_PREDEFINED_RESIDUAL_DATA()                           \
+  src_base  = (unsigned char *) _src;                                     \
+  dest_base = (unsigned char *) _dest;                                    \
+}
+
+/**
+ * Unpack cando_count predefined elements with typed copies instead of memcpy().
+ *
+ * Contiguous data at *rtn_src is scattered to *rtn_dest in blocks of
+ * elem->blocklen elements whose starts are elem->extent bytes apart.
+ *
+ * @param rtn_src     in/out cursor into the packed (contiguous) source
+ * @param rtn_dest    in/out cursor into the strided destination
+ * @param cando_count number of basic elements to copy
+ * @param elem        description of the element being unpacked
+ *
+ * @return OPAL_SUCCESS once the data is copied and both cursors are advanced;
+ *         OPAL_ERROR, with nothing copied and neither cursor modified, when
+ *         this path does not apply and the caller should use its memcpy path.
+ */
+static inline int
+opal_datatype_unpack_predefined_element( unsigned char** rtn_src,
+                                unsigned char** rtn_dest,
+                                size_t cando_count,
+                                const ddt_elem_desc_t* elem)
+{
+    const int id = elem->common.type;
+    const size_t blocklen = elem->blocklen;
+    const size_t size = opal_datatype_basicDatatypes[id]->size;
+    const int align = opal_datatype_basicDatatypes[id]->align;
+    unsigned char *src = *rtn_src;
+    unsigned char *dest = *rtn_dest;
+
+#if OPAL_CUDA_SUPPORT
+    if (opal_cuda_check_bufs(dest, src)) {
+        return OPAL_ERROR;
+    }
+    /* Checking the first element is believed to be enough; the extra test is
+     * for an element with only two entries and an extent big enough that the
+     * second entry could sit in a different kind of memory than the first. */
+    if (elem->count == 2 && cando_count >= blocklen &&
+       (opal_cuda_check_bufs(dest + elem->extent, src)))
+    {
+        return OPAL_ERROR;
+    }
+#endif
+    /*
+     * Leave to the memcpy path anything the typed copy cannot handle: a type
+     * without size or alignment, an extent that is not a whole number of
+     * elements (the stride below could not express it), or misaligned data.
+     */
+    if (0 == size || align <= 0 || 0 != elem->extent % (ptrdiff_t)size ||
+        (uintptr_t)src % align || (uintptr_t)dest % align ||
+        (elem->extent % align && cando_count > blocklen))
+    {
+        return OPAL_ERROR;
+    }
+    const ptrdiff_t stride = elem->extent / (ptrdiff_t)size; /* extent in elements, signed */
+
+    /*
+     * The copy macro needs the C type behind the incoming id: for
+     * OPAL_DATATYPE_INT4 we want
+     *   OPAL_DATATYPE_UNPACK_PREDEFINED_ELEMENT(src, dest, cando_count, stride, blocklen, int32_t);
+     * opal_datatype_internal.h already maintains that mapping, including
+     * which base types are unavailable, in its OPAL_DATATYPE_HANDLE_* macros.
+     * They are reused here with our own AV / NOTAV callbacks so that the
+     * table does not have to be copied and maintained a second time.
+     */
+#define OPAL_DATATYPE_MYUNPACK(NAME)               \
+    do {                                           \
+        OPAL_DATATYPE_HANDLE_ ## NAME(             \
+            OPAL_DATATYPE_MYUNPACK_AVAILABLE,      \
+            OPAL_DATATYPE_MYUNPACK_NOTAVAIL, 0);   \
+    } while (0)
+
+#define OPAL_DATATYPE_MYUNPACK_AVAILABLE(TYPE, unused_ALIGN, NAME, unused) \
+    do { \
+        OPAL_DATATYPE_UNPACK_PREDEFINED_ELEMENT(src, dest, cando_count, stride, blocklen, TYPE); \
+        success = true; \
+    } while (0)
+
+#define OPAL_DATATYPE_MYUNPACK_NOTAVAIL(NAME, unused) \
+    do { \
+        success = false; \
+    } while (0)
+
+    bool success = false;
+    switch(id) {
+        case OPAL_DATATYPE_INT1:
+            /* expands to the UNPACK_PREDEFINED_ELEMENT call above with int8_t */
+            OPAL_DATATYPE_MYUNPACK(INT1);
+            break;
+        case OPAL_DATATYPE_INT2:
+            OPAL_DATATYPE_MYUNPACK(INT2);
+            break;
+        case OPAL_DATATYPE_INT4:
+            OPAL_DATATYPE_MYUNPACK(INT4);
+            break;
+        case OPAL_DATATYPE_INT8:
+            OPAL_DATATYPE_MYUNPACK(INT8);
+            break;
+        case OPAL_DATATYPE_INT16:
+            OPAL_DATATYPE_MYUNPACK(INT16);
+            break;
+        case OPAL_DATATYPE_UINT1:
+            OPAL_DATATYPE_MYUNPACK(UINT1);
+            break;
+        case OPAL_DATATYPE_UINT2:
+            OPAL_DATATYPE_MYUNPACK(UINT2);
+            break;
+        case OPAL_DATATYPE_UINT4:
+            OPAL_DATATYPE_MYUNPACK(UINT4);
+            break;
+        case OPAL_DATATYPE_UINT8:
+            OPAL_DATATYPE_MYUNPACK(UINT8);
+            break;
+        case OPAL_DATATYPE_UINT16:
+            OPAL_DATATYPE_MYUNPACK(UINT16);
+            break;
+        case OPAL_DATATYPE_FLOAT2:
+            OPAL_DATATYPE_MYUNPACK(FLOAT2);
+            break;
+        case OPAL_DATATYPE_FLOAT4:
+            OPAL_DATATYPE_MYUNPACK(FLOAT4);
+            break;
+        case OPAL_DATATYPE_FLOAT8:
+            OPAL_DATATYPE_MYUNPACK(FLOAT8);
+            break;
+        case OPAL_DATATYPE_FLOAT12:
+            OPAL_DATATYPE_MYUNPACK(FLOAT12);
+            break;
+        case OPAL_DATATYPE_FLOAT16:
+            OPAL_DATATYPE_MYUNPACK(FLOAT16);
+            break;
+        case OPAL_DATATYPE_SHORT_FLOAT_COMPLEX:
+            OPAL_DATATYPE_MYUNPACK(SHORT_FLOAT_COMPLEX);
+            break;
+        case OPAL_DATATYPE_FLOAT_COMPLEX:
+            OPAL_DATATYPE_MYUNPACK(FLOAT_COMPLEX);
+            break;
+        case OPAL_DATATYPE_DOUBLE_COMPLEX:
+            OPAL_DATATYPE_MYUNPACK(DOUBLE_COMPLEX);
+            break;
+        case OPAL_DATATYPE_LONG_DOUBLE_COMPLEX:
+            OPAL_DATATYPE_MYUNPACK(LONG_DOUBLE_COMPLEX);
+            break;
+        case OPAL_DATATYPE_BOOL:
+            OPAL_DATATYPE_MYUNPACK(BOOL);
+            break;
+        case OPAL_DATATYPE_WCHAR:
+            OPAL_DATATYPE_MYUNPACK(WCHAR);
+            break;
+        default:
+            return OPAL_ERROR;
+    }
+    if (!success) {
+        return OPAL_ERROR;
+    }
+
+    *rtn_src  = src;
+    *rtn_dest = dest;
+    return OPAL_SUCCESS;
+}
+
+/*
+ * Pack cando_count predefined elements with typed copies instead of memcpy():
+ * blocks of elem->blocklen elements, elem->extent bytes apart at *rtn_src, are
+ * written contiguously at *rtn_dest.  Returns OPAL_SUCCESS with both cursors
+ * advanced, or OPAL_ERROR with nothing copied so the caller can use memcpy.
+ */
+static inline int
+opal_datatype_pack_predefined_element( unsigned char** rtn_src,
+                                unsigned char** rtn_dest,
+                                size_t cando_count,
+                                const ddt_elem_desc_t* elem)
+{
+    const int id = elem->common.type;
+    const size_t blocklen = elem->blocklen;
+    const size_t size = opal_datatype_basicDatatypes[id]->size;
+    const int align = opal_datatype_basicDatatypes[id]->align;
+    unsigned char *src = *rtn_src;
+    unsigned char *dest = *rtn_dest;
+
+#if OPAL_CUDA_SUPPORT
+    if (opal_cuda_check_bufs(dest, src)) {
+        return OPAL_ERROR;
+    }
+    if (elem->count == 2 && cando_count >= blocklen &&
+       (opal_cuda_check_bufs(dest, src + elem->extent)))
+    {
+        return OPAL_ERROR;
+    }
+#endif
+    /* Leave to the memcpy path: types without size or alignment, an extent
+     * that is not a whole number of elements, and misaligned data. */
+    if (0 == size || align <= 0 || 0 != elem->extent % (ptrdiff_t)size ||
+        (uintptr_t)src % align || (uintptr_t)dest % align ||
+        (elem->extent % align && cando_count > blocklen))
+    {
+        return OPAL_ERROR;
+    }
+    const ptrdiff_t stride = elem->extent / (ptrdiff_t)size; /* extent in elements, signed */
+
+#define OPAL_DATATYPE_MYPACK(NAME)                 \
+    do {                                           \
+        OPAL_DATATYPE_HANDLE_ ## NAME(             \
+            OPAL_DATATYPE_MYPACK_AVAILABLE,        \
+            OPAL_DATATYPE_MYPACK_NOTAVAIL, 0);     \
+    } while (0)
+
+#define OPAL_DATATYPE_MYPACK_AVAILABLE(TYPE, unused_ALIGN, NAME, unused) \
+    do { \
+        OPAL_DATATYPE_PACK_PREDEFINED_ELEMENT(src, dest, cando_count, stride, blocklen, TYPE); \
+        success = true; \
+    } while (0)
+
+#define OPAL_DATATYPE_MYPACK_NOTAVAIL(NAME, unused) \
+    do { \
+        success = false; \
+    } while (0)
+
+    bool success = false;
+    switch(id) {
+        case OPAL_DATATYPE_INT1:
+            OPAL_DATATYPE_MYPACK(INT1);
+            break;
+        case OPAL_DATATYPE_INT2:
+            OPAL_DATATYPE_MYPACK(INT2);
+            break;
+        case OPAL_DATATYPE_INT4:
+            OPAL_DATATYPE_MYPACK(INT4);
+            break;
+        case OPAL_DATATYPE_INT8:
+            OPAL_DATATYPE_MYPACK(INT8);
+            break;
+        case OPAL_DATATYPE_INT16:
+            OPAL_DATATYPE_MYPACK(INT16);
+            break;
+        case OPAL_DATATYPE_UINT1:
+            OPAL_DATATYPE_MYPACK(UINT1);
+            break;
+        case OPAL_DATATYPE_UINT2:
+            OPAL_DATATYPE_MYPACK(UINT2);
+            break;
+        case OPAL_DATATYPE_UINT4:
+            OPAL_DATATYPE_MYPACK(UINT4);
+            break;
+        case OPAL_DATATYPE_UINT8:
+            OPAL_DATATYPE_MYPACK(UINT8);
+            break;
+        case OPAL_DATATYPE_UINT16:
+            OPAL_DATATYPE_MYPACK(UINT16);
+            break;
+        case OPAL_DATATYPE_FLOAT2:
+            OPAL_DATATYPE_MYPACK(FLOAT2);
+            break;
+        case OPAL_DATATYPE_FLOAT4:
+            OPAL_DATATYPE_MYPACK(FLOAT4);
+            break;
+        case OPAL_DATATYPE_FLOAT8:
+            OPAL_DATATYPE_MYPACK(FLOAT8);
+            break;
+        case OPAL_DATATYPE_FLOAT12:
+            OPAL_DATATYPE_MYPACK(FLOAT12);
+            break;
+        case OPAL_DATATYPE_FLOAT16:
+            OPAL_DATATYPE_MYPACK(FLOAT16);
+            break;
+        case OPAL_DATATYPE_SHORT_FLOAT_COMPLEX:
+            OPAL_DATATYPE_MYPACK(SHORT_FLOAT_COMPLEX);
+            break;
+        case OPAL_DATATYPE_FLOAT_COMPLEX:
+            OPAL_DATATYPE_MYPACK(FLOAT_COMPLEX);
+            break;
+        case OPAL_DATATYPE_DOUBLE_COMPLEX:
+            OPAL_DATATYPE_MYPACK(DOUBLE_COMPLEX);
+            break;
+        case OPAL_DATATYPE_LONG_DOUBLE_COMPLEX:
+            OPAL_DATATYPE_MYPACK(LONG_DOUBLE_COMPLEX);
+            break;
+        case OPAL_DATATYPE_BOOL:
+            OPAL_DATATYPE_MYPACK(BOOL);
+            break;
+        case OPAL_DATATYPE_WCHAR:
+            OPAL_DATATYPE_MYPACK(WCHAR);
+            break;
+        default:
+            return OPAL_ERROR;
+    }
+    if (!success) {
+        return OPAL_ERROR;
+    }
+
+    *rtn_src  = src;
+    *rtn_dest = dest;
+    return OPAL_SUCCESS;
+}
+#endif
diff --git a/opal/datatype/opal_datatype_unpack.h b/opal/datatype/opal_datatype_unpack.h
index 79068729a14b595b27a1d38d7e82502e5087ca2f..a786a2fc7e9f508f6c05e412b3180a9114e99c60 100644
--- a/opal/datatype/opal_datatype_unpack.h
+++ b/opal/datatype/opal_datatype_unpack.h
@@ -7,6 +7,7 @@
  * Copyright (c) 2011      NVIDIA Corporation.  All rights reserved.
  * Copyright (c) 2017-2018 Research Organization for Information Science
  *                         and Technology (RIST).  All rights reserved.
+ * Copyright (c) 2020-2021 IBM Corporation. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -18,6 +19,7 @@
 #define OPAL_DATATYPE_UNPACK_H_HAS_BEEN_INCLUDED
 
 #include "opal_config.h"
+#include "opal/datatype/opal_datatype_pack_unpack_predefined.h"
 
 #if !defined(CHECKSUM) && OPAL_CUDA_SUPPORT
 /* Make use of existing macro to do CUDA style memcpy */
@@ -103,6 +105,13 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR,
     /* premptively update the number of COUNT we will return. */
     *(COUNT) -= cando_count;
 
+    if( _elem->blocklen < 9 ) {
+        if(OPAL_LIKELY(OPAL_SUCCESS == opal_datatype_unpack_predefined_element(&_packed, &_memory, cando_count, _elem))) {
+            goto update_and_return;
+        }
+        /* else unrecognized _elem->common.type, use the memcpy path */
+    }
+
     if( 1 == _elem->blocklen ) {  /* Do as many full blocklen as possible */
         for(; cando_count > 0; cando_count--) {
             OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf,