Merge pull request #4143 from aravindksg/psm2_cuda

Add support for GPU buffers for PSM2 MTL
2017-09-01 21:09:55 -07:00 · 2017-09-01 21:09:55 -07:00 · c1ce233eaf
--- a/ompi/mca/mtl/mtl.h
+++ b/ompi/mca/mtl/mtl.h
@ -5,6 +5,7 @@
 * Copyright (c) 2012      Sandia National Laboratories.  All rights reserved.
 * Copyright (c) 2015      Los Alamos National Security, LLC. All rights
 *                         reserved.
+ * Copyright (c) 2017      Intel, Inc. All rights reserved
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -61,6 +62,9 @@ typedef struct mca_mtl_request_t mca_mtl_request_t;
 * MTL module flags
 */
 #define MCA_MTL_BASE_FLAG_REQUIRE_WORLD 0x00000001
+#if OPAL_CUDA_SUPPORT
+#define MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE 0x00000002 /* set by an MTL; pml/cm then skips CUDA convertor init for contiguous data */
+#endif

 /**
 * Initialization routine for MTL component
--- a/ompi/mca/mtl/psm2/mtl_psm2.c
+++ b/ompi/mca/mtl/psm2/mtl_psm2.c
@ -11,7 +11,7 @@
 * Copyright (c) 2004-2006 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2006      QLogic Corporation. All rights reserved.
- * Copyright (c) 2013-2015 Intel, Inc. All rights reserved
+ * Copyright (c) 2013-2017 Intel, Inc. All rights reserved
 * Copyright (c) 2014      Los Alamos National Security, LLC. All rights
 *                         reserved.
 * Copyright (c) 2016      Research Organization for Information Science
@ -100,6 +100,9 @@ int ompi_mtl_psm2_module_init(int local_rank, int num_local_procs) {
    char *generated_key;
    char env_string[256];
    int rc;
+#if OPAL_CUDA_SUPPORT
+    char *cuda_env;
+#endif

    generated_key = getenv(OPAL_MCA_PREFIX"orte_precondition_transports");
    memset(uu, 0, sizeof(psm2_uuid_t));
@ -173,6 +176,15 @@ int ompi_mtl_psm2_module_init(int local_rank, int num_local_procs) {
    /* register the psm2 progress function */
    opal_progress_register(ompi_mtl_psm2_progress);

+#if OPAL_CUDA_SUPPORT
+    ompi_mtl_psm2.super.mtl_flags |= MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE;
+
+    cuda_env = getenv("PSM2_CUDA");
+    if (!cuda_env || ( strcmp(cuda_env, "0") == 0) )
+        opal_output(0, "Warning: If running with device buffers, there is a"
+                    " chance the application might fail. Try setting PSM2_CUDA=1.\n");
+#endif
+
    return OMPI_SUCCESS;
 }

--- a/ompi/mca/pml/cm/pml_cm.h
+++ b/ompi/mca/pml/cm/pml_cm.h
@ -6,6 +6,7 @@
 *                         reserved.
 * Copyright (c) 2015      Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
+ * Copyright (c) 2017      Intel, Inc. All rights reserved
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -79,6 +80,7 @@ mca_pml_cm_irecv_init(void *addr,
                      struct ompi_request_t **request)
 {
    mca_pml_cm_hvy_recv_request_t *recvreq;
+    uint32_t flags = 0;
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
    ompi_proc_t* ompi_proc;
 #endif
@ -87,7 +89,7 @@ mca_pml_cm_irecv_init(void *addr,
    if( OPAL_UNLIKELY(NULL == recvreq) ) return OMPI_ERR_OUT_OF_RESOURCE;

    MCA_PML_CM_HVY_RECV_REQUEST_INIT(recvreq, ompi_proc, comm, tag, src,
-                                     datatype, addr, count, true);
+                                     datatype, addr, count, flags, true);

    *request = (ompi_request_t*) recvreq;

@ -104,6 +106,7 @@ mca_pml_cm_irecv(void *addr,
                 struct ompi_request_t **request)
 {
    int ret;
+    uint32_t flags = 0;
    mca_pml_cm_thin_recv_request_t *recvreq;
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
    ompi_proc_t* ompi_proc = NULL;
@ -118,7 +121,8 @@ mca_pml_cm_irecv(void *addr,
                                      src,
                                      datatype,
                                      addr,
-                                      count);
+                                      count,
+                                      flags);

    MCA_PML_CM_THIN_RECV_REQUEST_START(recvreq, comm, tag, src, ret);

@ -145,6 +149,7 @@ mca_pml_cm_recv(void *addr,
                ompi_status_public_t * status)
 {
    int ret;
+    uint32_t flags = 0;
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
    ompi_proc_t *ompi_proc;
 #endif
@ -173,20 +178,24 @@ mca_pml_cm_recv(void *addr,
        ompi_proc = ompi_comm_peer_lookup( comm, src );
    }

+    MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);
+
    opal_convertor_copy_and_prepare_for_recv(
 	ompi_proc->super.proc_convertor,
 		&(datatype->super),
 		count,
 		addr,
-		0,
+		flags,
 		&convertor );
 #else
+    MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);
+
    opal_convertor_copy_and_prepare_for_recv(
 	ompi_mpi_local_convertor,
 		&(datatype->super),
 		count,
 		addr,
-		0,
+		flags,
 		&convertor );
 #endif

@ -222,6 +231,7 @@ mca_pml_cm_isend_init(const void* buf,
                        ompi_request_t** request)
 {
    mca_pml_cm_hvy_send_request_t *sendreq;
+    uint32_t flags = 0;
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
    ompi_proc_t* ompi_proc;
 #endif
@ -230,7 +240,7 @@ mca_pml_cm_isend_init(const void* buf,
    if (OPAL_UNLIKELY(NULL == sendreq)) return OMPI_ERR_OUT_OF_RESOURCE;

    MCA_PML_CM_HVY_SEND_REQUEST_INIT(sendreq, ompi_proc, comm, tag, dst,
-                                     datatype, sendmode, true, false, buf, count);
+                                     datatype, sendmode, true, false, buf, count, flags);

    /* Work around a leak in start by marking this request as complete. The
     * problem occured because we do not have a way to differentiate an
@ -254,6 +264,7 @@ mca_pml_cm_isend(const void* buf,
                   ompi_request_t** request)
 {
    int ret;
+    uint32_t flags = 0;

    if(sendmode == MCA_PML_BASE_SEND_BUFFERED ) {
        mca_pml_cm_hvy_send_request_t* sendreq;
@ -274,7 +285,8 @@ mca_pml_cm_isend(const void* buf,
                                         false,
                                         false,
                                         buf,
-                                         count);
+                                         count,
+                                         flags);

        MCA_PML_CM_HVY_SEND_REQUEST_START( sendreq, ret);

@ -296,7 +308,8 @@ mca_pml_cm_isend(const void* buf,
                                          datatype,
                                          sendmode,
                                          buf,
-                                          count);
+                                          count,
+                                          flags);

        MCA_PML_CM_THIN_SEND_REQUEST_START(
                                           sendreq,
@ -324,6 +337,7 @@ mca_pml_cm_send(const void *buf,
                ompi_communicator_t* comm)
 {
    int ret = OMPI_ERROR;
+    uint32_t flags = 0;
    ompi_proc_t * ompi_proc;

    if(sendmode == MCA_PML_BASE_SEND_BUFFERED) {
@ -342,7 +356,8 @@ mca_pml_cm_send(const void *buf,
                                         false,
                                         false,
                                         buf,
-                                         count);
+                                         count,
+                                         flags);
        MCA_PML_CM_HVY_SEND_REQUEST_START(sendreq, ret);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            MCA_PML_CM_HVY_SEND_REQUEST_RETURN(sendreq);
@ -368,9 +383,12 @@ mca_pml_cm_send(const void *buf,
 #endif
 	{
 		ompi_proc = ompi_comm_peer_lookup(comm, dst);
+
+                MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);
+
 		opal_convertor_copy_and_prepare_for_send(
 		ompi_proc->super.proc_convertor,
-			&datatype->super, count, buf, 0,
+			&datatype->super, count, buf, flags,
 			&convertor);
 	}

@ -459,6 +477,7 @@ mca_pml_cm_imrecv(void *buf,
                  struct ompi_request_t **request)
 {
    int ret;
+    uint32_t flags = 0;
    mca_pml_cm_thin_recv_request_t *recvreq;
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
    ompi_proc_t* ompi_proc;
@ -474,7 +493,8 @@ mca_pml_cm_imrecv(void *buf,
                                      (*message)->peer,
                                      datatype,
                                      buf,
-                                      count);
+                                      count,
+                                      flags);

    MCA_PML_CM_THIN_RECV_REQUEST_MATCHED_START(recvreq, message, ret);

@ -491,6 +511,7 @@ mca_pml_cm_mrecv(void *buf,
                 ompi_status_public_t* status)
 {
    int ret;
+    uint32_t flags = 0;
    mca_pml_cm_thin_recv_request_t *recvreq;
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
    ompi_proc_t* ompi_proc;
@ -506,7 +527,8 @@ mca_pml_cm_mrecv(void *buf,
                                      (*message)->peer,
                                      datatype,
                                      buf,
-                                      count);
+                                      count,
+                                      flags);

    MCA_PML_CM_THIN_RECV_REQUEST_MATCHED_START(recvreq,
                                               message, ret);
--- a/ompi/mca/pml/cm/pml_cm_recvreq.h
+++ b/ompi/mca/pml/cm/pml_cm_recvreq.h
@ -13,6 +13,7 @@
 * Copyright (c) 2012      Sandia National Laboratories.  All rights reserved.
 * Copyright (c) 2015      Los Alamos National Security, LLC. All rights
 *                         reserved.
+ * Copyright (c) 2017      Intel, Inc. All rights reserved
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -92,7 +93,8 @@ do {                                                                           \
                                           src,                         \
                                           datatype,                    \
                                           addr,                        \
-                                           count )                      \
+                                           count,                       \
+					   flags )                      \
 do {                                                                    \
    OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, false);            \
    (request)->req_base.req_ompi.req_mpi_object.comm = comm;            \
@ -108,12 +110,13 @@ do {                                                                    \
    } else {                                                            \
        ompi_proc = ompi_comm_peer_lookup( comm, src );                 \
    }                                                                   \
+    MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);       \
    opal_convertor_copy_and_prepare_for_recv(                           \
                                  ompi_proc->super.proc_convertor,      \
                                  &(datatype->super),                   \
                                  count,                                \
                                  addr,                                 \
-                                  0,                                    \
+                                  flags,                                    \
                                  &(request)->req_base.req_convertor ); \
 } while(0)
 #else
@ -123,7 +126,8 @@ do {                                                                    \
                                           src,                         \
                                           datatype,                    \
                                           addr,                        \
-                                           count )                      \
+                                           count,                       \
+					   flags )                      \
 do {                                                                    \
    OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, false);            \
    (request)->req_base.req_ompi.req_mpi_object.comm = comm;            \
@ -134,12 +138,13 @@ do {                                                                    \
    OBJ_RETAIN(comm);                                                   \
    OMPI_DATATYPE_RETAIN(datatype);                                     \
                                                                        \
+    MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);       \
    opal_convertor_copy_and_prepare_for_recv(                           \
        ompi_mpi_local_convertor,                                       \
        &(datatype->super),                                             \
        count,                                                          \
        addr,                                                           \
-        0,                                                              \
+        flags,                                                              \
        &(request)->req_base.req_convertor );                           \
 } while(0)
 #endif
@ -153,6 +158,7 @@ do {                                                                    \
                                          datatype,                     \
                                          addr,                         \
                                          count,                        \
+					  flags,                        \
                                          persistent)                   \
 do {                                                                    \
    OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, persistent);       \
@ -173,12 +179,13 @@ do {                                                                    \
    } else {                                                            \
        ompi_proc = ompi_comm_peer_lookup( comm, src );                 \
    }                                                                   \
+    MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);       \
    opal_convertor_copy_and_prepare_for_recv(                           \
                                  ompi_proc->super.proc_convertor,      \
                                  &(datatype->super),                   \
                                  count,                                \
                                  addr,                                 \
-                                  0,                                    \
+                                  flags,                                \
                                  &(request)->req_base.req_convertor ); \
 } while(0)
 #else
@ -190,6 +197,7 @@ do {                                                                    \
                                          datatype,                     \
                                          addr,                         \
                                          count,                        \
+					  flags,                        \
                                          persistent)                   \
 do {                                                                    \
    OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, persistent);       \
@ -205,12 +213,13 @@ do {                                                                    \
    OBJ_RETAIN(comm);                                                   \
    OMPI_DATATYPE_RETAIN(datatype);                                     \
                                                                        \
+    MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);       \
    opal_convertor_copy_and_prepare_for_recv(                           \
        ompi_mpi_local_convertor,                                       \
        &(datatype->super),                                             \
        count,                                                          \
        addr,                                                           \
-        0,                                                              \
+        flags,                                                              \
        &(request)->req_base.req_convertor );                           \
 } while(0)
 #endif
--- a/ompi/mca/pml/cm/pml_cm_request.h
+++ b/ompi/mca/pml/cm/pml_cm_request.h
@ -9,6 +9,7 @@
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2006 The Regents of the University of California.
 *                         All rights reserved.
+ * Copyright (c) 2017      Intel, Inc. All rights reserved
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -53,4 +54,20 @@ struct mca_pml_cm_request_t {
 typedef struct mca_pml_cm_request_t mca_pml_cm_request_t;
 OBJ_CLASS_DECLARATION(mca_pml_cm_request_t);

+/*
+ * Set CONVERTOR_SKIP_CUDA_INIT in flags only for contiguous memory and only if
+ * the MTL requests it; non-contiguous memory keeps the CUDA convertor init.
+ */
+#if OPAL_CUDA_SUPPORT
+#define MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count)            \
+    do {                                                                        \
+        if (opal_datatype_is_contiguous_memory_layout(&(datatype)->super, (count)) \
+            && (ompi_mtl->mtl_flags & MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE)) {   \
+            (flags) |= CONVERTOR_SKIP_CUDA_INIT;                                \
+        }                                                                       \
+    } while (0)
+#else
+#define MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count) do { } while (0)
+#endif
+
 #endif
--- a/ompi/mca/pml/cm/pml_cm_sendreq.h
+++ b/ompi/mca/pml/cm/pml_cm_sendreq.h
@ -14,6 +14,7 @@
 *                         reserved.
 * Copyright (c) 2015      Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
+ * Copyright (c) 2017      Intel, Inc. All rights reserved
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -125,18 +126,20 @@ do {                                                                    \
                                            datatype,                   \
                                            sendmode,                   \
                                            buf,                        \
-                                            count)                      \
+                                            count,                      \
+					    flags )                     \
 {                                                                       \
    OBJ_RETAIN(comm);                                                   \
    OMPI_DATATYPE_RETAIN(datatype);                                     \
    (req_send)->req_base.req_comm = comm;                               \
    (req_send)->req_base.req_datatype = datatype;                       \
+    MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);       \
    opal_convertor_copy_and_prepare_for_send(                           \
                                             ompi_proc->super.proc_convertor, \
                                             &(datatype->super),        \
                                             count,                     \
                                             buf,                       \
-                                             0,                         \
+                                             flags,                         \
                                             &(req_send)->req_base.req_convertor ); \
    (req_send)->req_base.req_ompi.req_mpi_object.comm = comm;           \
    (req_send)->req_base.req_ompi.req_status.MPI_SOURCE =               \
@ -154,18 +157,20 @@ do {                                                                    \
                                            datatype,                   \
                                            sendmode,                   \
                                            buf,                        \
-                                            count)                      \
+                                            count,                      \
+					    flags )                     \
 {                                                                       \
    OBJ_RETAIN(comm);                                                   \
    OMPI_DATATYPE_RETAIN(datatype);                                     \
    (req_send)->req_base.req_comm = comm;                               \
    (req_send)->req_base.req_datatype = datatype;                       \
+    MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);       \
    opal_convertor_copy_and_prepare_for_send(                           \
        ompi_mpi_local_convertor,                                       \
        &(datatype->super),                                             \
        count,                                                          \
        buf,                                                            \
-        0,                                                              \
+        flags,                                                              \
        &(req_send)->req_base.req_convertor );                          \
    (req_send)->req_base.req_ompi.req_mpi_object.comm = comm;           \
    (req_send)->req_base.req_ompi.req_status.MPI_SOURCE =               \
@ -185,18 +190,20 @@ do {                                                                    \
                                            datatype,                   \
                                            sendmode,                   \
                                            buf,                        \
-                                            count)                      \
+                                            count,                      \
+					    flags )                     \
 {                                                                       \
    OBJ_RETAIN(comm);                                                   \
    OMPI_DATATYPE_RETAIN(datatype);                                     \
    (req_send)->req_base.req_comm = comm;                               \
    (req_send)->req_base.req_datatype = datatype;                       \
+    MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);       \
    opal_convertor_copy_and_prepare_for_send(                           \
                                             ompi_proc->super.proc_convertor, \
                                             &(datatype->super),        \
                                             count,                     \
                                             buf,                       \
-                                             0,                         \
+                                             flags,                         \
                                             &(req_send)->req_base.req_convertor ); \
    (req_send)->req_base.req_ompi.req_mpi_object.comm = comm;           \
    (req_send)->req_base.req_ompi.req_status.MPI_SOURCE =               \
@ -215,7 +222,8 @@ do {                                                                    \
                                            datatype,                   \
                                            sendmode,                   \
                                            buf,                        \
-                                            count)                      \
+                                            count,                      \
+					    flags )                     \
 {                                                                       \
    OBJ_RETAIN(comm);                                                   \
    OMPI_DATATYPE_RETAIN(datatype);                                     \
@ -235,12 +243,13 @@ do {                                                                    \
        (req_send)->req_base.req_convertor.count      = count;          \
        (req_send)->req_base.req_convertor.pDesc      = &datatype->super; \
    } else {                                                            \
+        MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);   \
        opal_convertor_copy_and_prepare_for_send(                       \
            ompi_mpi_local_convertor,                                   \
            &(datatype->super),                                         \
            count,                                                      \
            buf,                                                        \
-            0,                                                          \
+            flags,                                                          \
            &(req_send)->req_base.req_convertor );                      \
    }                                                                   \
    (req_send)->req_base.req_ompi.req_mpi_object.comm = comm;           \
@ -263,7 +272,8 @@ do {                                                                    \
                                          persistent,                   \
                                          blocking,                     \
                                          buf,                          \
-                                          count)                        \
+                                          count,                        \
+					  flags )                       \
    do {                                                                \
        OMPI_REQUEST_INIT(&(sendreq->req_send.req_base.req_ompi),       \
                          persistent);                                  \
@ -278,7 +288,8 @@ do {                                                                    \
                                             datatype,                  \
                                             sendmode,                  \
                                             buf,                       \
-                                             count);                    \
+                                             count,                     \
+					     flags )                    \
        opal_convertor_get_packed_size(                                 \
                                       &sendreq->req_send.req_base.req_convertor, \
                                       &sendreq->req_count );           \
@ -297,7 +308,8 @@ do {                                                                    \
                                           datatype,                    \
                                           sendmode,                    \
                                           buf,                         \
-                                           count)                       \
+                                           count,                       \
+					   flags )                      \
    do {                                                                \
        OMPI_REQUEST_INIT(&(sendreq->req_send.req_base.req_ompi),       \
                          false);                                       \
@ -308,7 +320,8 @@ do {                                                                    \
                                             datatype,                  \
                                             sendmode,                  \
                                             buf,                       \
-                                             count);                    \
+                                             count,                     \
+                                             flags);                    \
        sendreq->req_send.req_base.req_pml_complete = false;            \
    } while(0)

--- a/opal/datatype/opal_convertor.c
+++ b/opal/datatype/opal_convertor.c
@ -14,6 +14,7 @@
 * Copyright (c) 2011      NVIDIA Corporation.  All rights reserved.
 * Copyright (c) 2013-2017 Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
+ * Copyright (c) 2017      Intel, Inc. All rights reserved
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -569,7 +570,9 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor,

    convertor->flags |= CONVERTOR_RECV;
 #if OPAL_CUDA_SUPPORT
-    mca_cuda_convertor_init(convertor, pUserBuf);
+    if (!( convertor->flags & CONVERTOR_SKIP_CUDA_INIT )) {
+        mca_cuda_convertor_init(convertor, pUserBuf);
+    }
 #endif

    assert(! (convertor->flags & CONVERTOR_SEND));
@ -607,7 +610,9 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor,
 {
    convertor->flags |= CONVERTOR_SEND;
 #if OPAL_CUDA_SUPPORT
-    mca_cuda_convertor_init(convertor, pUserBuf);
+    if (!( convertor->flags & CONVERTOR_SKIP_CUDA_INIT )) {
+        mca_cuda_convertor_init(convertor, pUserBuf);
+    }
 #endif

    OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf );
--- a/opal/datatype/opal_convertor.h
+++ b/opal/datatype/opal_convertor.h
@ -14,6 +14,7 @@
 * Copyright (c) 2014      NVIDIA Corporation.  All rights reserved.
 * Copyright (c) 2017      Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
+ * Copyright (c) 2017      Intel, Inc. All rights reserved
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -55,6 +56,7 @@ BEGIN_C_DECLS
 #define CONVERTOR_COMPLETED        0x08000000
 #define CONVERTOR_CUDA_UNIFIED     0x10000000
 #define CONVERTOR_HAS_REMOTE_SIZE  0x20000000
+#define CONVERTOR_SKIP_CUDA_INIT   0x40000000

 union dt_elem_desc;
 typedef struct opal_convertor_t opal_convertor_t;