openmpi/ompi/mca/pml/ob1/pml_ob1_cuda.c

/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2008 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, 
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2008      UT-Battelle, LLC. All rights reserved.
 * Copyright (c) 2010      Oracle and/or its affiliates.  All rights reserved.
 * Copyright (c) 2012-2013 NVIDIA Corporation.  All rights reserved.
 * $COPYRIGHT$
 * 
 * Additional copyrights may follow
 * 
 * $HEADER$
 */


#include "ompi_config.h"
#include "opal/prefetch.h"
#include "ompi/constants.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/mca/mpool/mpool.h" 
#include "pml_ob1.h"
#include "pml_ob1_hdr.h"
#include "pml_ob1_rdmafrag.h"
#include "pml_ob1_recvreq.h"
#include "pml_ob1_sendreq.h"
#include "ompi/mca/bml/base/base.h"
#include "ompi/memchecker.h"

/*
 * Forward declarations for the CUDA helpers defined further down in this
 * file.
 */

/* Registers the buffer [base, base + size) with the mpool of every
 * MCA_BTL_FLAGS_CUDA_GET capable BTL in the endpoint's btl_send array and
 * records the usable BTLs in rdma_btls.  Returns the number of entries
 * filled in (0 if none can be used). */
size_t mca_pml_ob1_rdma_cuda_btls(
    mca_bml_base_endpoint_t* bml_endpoint,
    unsigned char* base,
    size_t size,
    mca_pml_ob1_com_btl_t* rdma_btls);

/* Reports whether a receive request (passed as an opaque pointer to a
 * mca_pml_ob1_recv_request_t) needs intermediate buffers when the given
 * BTL is used.  Returns true or false as an int. */
int mca_pml_ob1_cuda_need_buffers(void * rreq,
                                  mca_btl_base_module_t* btl);

/* Sets MCA_BTL_FLAGS_CUDA_GET on the btl_send entries of errproc's BML
 * endpoint that refer to btl.  When btlinfo is non-NULL it is read as a
 * pointer to the int id of the verbose output stream. */
void mca_pml_ob1_cuda_add_ipc_support(struct mca_btl_base_module_t* btl, int32_t flags,
                                      ompi_proc_t* errproc, char* btlinfo);

/**
 * Start a send request whose user buffer lives in CUDA (GPU) memory.
 *
 * Without OPAL_CUDA_SUPPORT_41 the request is always started as a
 * rendezvous that carries no data in its first fragment.  With it, the
 * protocol is chosen as follows:
 *   - (OPAL_CUDA_GDR_SUPPORT only) a CUDA convertor whose packed size
 *     exceeds btl_cuda_rdma_limit minus the PML header size goes straight
 *     to rendezvous;
 *   - if the convertor, evaluated with CONVERTOR_CUDA temporarily cleared,
 *     needs intermediate buffers, rendezvous with no initial data is used;
 *   - otherwise the buffer is handed to mca_pml_ob1_rdma_cuda_btls(); when
 *     at least one BTL accepts it the RDMA protocol is started, else a
 *     rendezvous is started (contiguous, with "size" initial bytes, when
 *     the BTL advertises MCA_BTL_FLAGS_CUDA_PUT).
 *
 * @param sendreq  send request to start
 * @param bml_btl  BTL selected for the first fragment
 * @param size     initial payload size, only used on the CUDA_PUT path
 *
 * @return status code of the protocol start routine that was invoked
 */
int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
                                        mca_bml_base_btl_t* bml_btl,
                                        size_t size) {
    int rc;
#if OPAL_CUDA_SUPPORT_41
    opal_convertor_t *convertor = &sendreq->req_send.req_base.req_convertor;
    const size_t packed_bytes = sendreq->req_send.req_bytes_packed;

#if OPAL_CUDA_GDR_SUPPORT
    /* Large messages on some BTLs are better served by RNDV than by RGET */
    if ((convertor->flags & CONVERTOR_CUDA) &&
        (packed_bytes > (bml_btl->btl->btl_cuda_rdma_limit - sizeof(mca_pml_ob1_hdr_t)))) {
        return mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
    }
#endif /* OPAL_CUDA_GDR_SUPPORT */

    /* Query the convertor with the CUDA flag masked out; the flag is put
     * back on every path below. */
    convertor->flags &= ~CONVERTOR_CUDA;

    if (opal_convertor_need_buffers(convertor)) {
        /* Copying GPU memory into the first RNDV message is expensive, so
         * that message is sent without any payload. */
        convertor->flags |= CONVERTOR_CUDA;
        return mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
    }

    {
        unsigned char *base;

        opal_convertor_get_current_pointer(convertor, (void**)&base);
        /* Restore the CUDA flag before handing the buffer to the BTLs */
        convertor->flags |= CONVERTOR_CUDA;
        sendreq->req_rdma_cnt =
            (uint32_t)mca_pml_ob1_rdma_cuda_btls(sendreq->req_endpoint, base,
                                                 packed_bytes,
                                                 sendreq->req_rdma);
    }

    if (0 != sendreq->req_rdma_cnt) {
        rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, packed_bytes);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            mca_pml_ob1_free_rdma_resources(sendreq);
        }
    } else if (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_PUT) {
        rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size,
                                                 MCA_PML_OB1_HDR_FLAGS_CONTIG);
    } else {
        rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
    }
#else
    /* No CUDA 4.1 support: plain rendezvous with zero bytes of initial data */
    rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
#endif /* OPAL_CUDA_SUPPORT_41 */
    return rc;
}

    
/**
 * Select the BTLs usable for an RDMA get of a CUDA (GPU) buffer.
 *
 * Walks the endpoint's btl_send array.  For every BTL flagged with
 * MCA_BTL_FLAGS_CUDA_GET that has an mpool, the buffer is registered with
 * MCA_MPOOL_FLAGS_CUDA_GPU_MEM; each BTL that yields a registration is
 * recorded in rdma_btls, up to mca_pml_ob1.max_rdma_per_request entries.
 * On success the per-BTL lengths are filled in by
 * mca_pml_ob1_calc_weighted_length().
 *
 * @param bml_endpoint  endpoint whose btl_send array is scanned
 * @param base          start address of the buffer to register
 * @param size          length of the buffer passed to the mpool
 * @param rdma_btls     output array receiving the selected BTLs and their
 *                      registrations
 *
 * @return number of entries filled in rdma_btls, or 0 if the RDMA protocol
 *         should not be used.  When 0 is returned no registration taken by
 *         this function is left outstanding.
 */
size_t mca_pml_ob1_rdma_cuda_btls(
    mca_bml_base_endpoint_t* bml_endpoint,
    unsigned char* base,
    size_t size,
    mca_pml_ob1_com_btl_t* rdma_btls)
{
    int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
    double weight_total = 0;
    int num_btls_used = 0, n;

    /* shortcut when there are no btls at all */
    if(num_btls == 0) {
        return 0;
    }

    /* register the memory with every CUDA-get capable btl */
    for(n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request;
            n++) {
        mca_bml_base_btl_t* bml_btl =
            mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n);

        if (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) {
            mca_mpool_base_registration_t* reg = NULL;
            mca_mpool_base_module_t *btl_mpool = bml_btl->btl->btl_mpool;

            if( NULL != btl_mpool ) {
                /* register the memory; failure is detected through reg
                 * staying NULL */
                btl_mpool->mpool_register(btl_mpool, base, size, MCA_MPOOL_FLAGS_CUDA_GPU_MEM, &reg);
            }

            if(NULL == reg) {
                continue;
            }

            rdma_btls[num_btls_used].bml_btl = bml_btl;
            rdma_btls[num_btls_used].btl_reg = reg;
            weight_total += bml_btl->btl_weight;
            num_btls_used++;
        }
    }

    if(0 == num_btls_used) {
        return 0;
    }

    /* if we don't use leave_pinned and the BTLs that registered this memory
     * amount to less than half of the available bandwidth - fall back to
     * the pipeline protocol */
    if(!mca_pml_ob1.leave_pinned && weight_total < 0.5) {
        /* The caller records a count of 0 and therefore never releases the
         * registrations taken above, so give them back here. */
        for(n = 0; n < num_btls_used; n++) {
            mca_mpool_base_module_t *btl_mpool =
                rdma_btls[n].bml_btl->btl->btl_mpool;

            if( NULL != btl_mpool && NULL != btl_mpool->mpool_deregister ) {
                btl_mpool->mpool_deregister(btl_mpool, rdma_btls[n].btl_reg);
            }
            rdma_btls[n].btl_reg = NULL;
        }
        return 0;
    }

    mca_pml_ob1_calc_weighted_length(rdma_btls, num_btls_used, size,
                                     weight_total);

    return num_btls_used;
}

/**
 * Decide whether a receive into a CUDA buffer needs intermediate buffers
 * when the data arrives over the given BTL.
 *
 * The BTL is looked up in the peer endpoint's btl_send array and, if it is
 * not there, in its btl_rdma array.  Only when the receive convertor is
 * flagged CONVERTOR_CUDA and the matching BML entry has
 * MCA_BTL_FLAGS_CUDA_GET set is the convertor consulted (with
 * CONVERTOR_CUDA temporarily cleared and restored afterwards).
 *
 * @param rreq  receive request, passed as an opaque pointer to a
 *              mca_pml_ob1_recv_request_t
 * @param btl   BTL module the data is coming in on
 *
 * @return false if the convertor reports that no buffers are needed on a
 *         CUDA-get capable BTL; true in every other case, including when
 *         the BTL cannot be found on the endpoint.
 */
int mca_pml_ob1_cuda_need_buffers(void * rreq,
                                  mca_btl_base_module_t* btl) 
{
    mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)rreq;
    mca_bml_base_endpoint_t* bml_endpoint = 
        (mca_bml_base_endpoint_t*)recvreq->req_recv.req_base.req_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
    mca_bml_base_btl_t *bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_send, btl);
    int need_buffers;

    /* A btl may be in the rdma list without being in the send list */
    if (NULL == bml_btl) {
        bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl);
    }
    /* Unknown btl: take the conservative answer instead of dereferencing
     * a NULL pointer below. */
    if (NULL == bml_btl) {
        return true;
    }

    if (!(recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA) ||
        !(bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET)) {
        return true;
    }

    /* Ask the convertor with the CUDA flag masked out, then put it back */
    recvreq->req_recv.req_base.req_convertor.flags &= ~CONVERTOR_CUDA;
    need_buffers =
        (opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == true);
    recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA;

    return need_buffers ? true : false;
}

/*
 * Switch on the RDMA get protocol for GPU buffers towards one peer.
 *
 * Every entry of the peer's btl_send array that refers to "btl" gets
 * MCA_BTL_FLAGS_CUDA_GET added to its flags, and a verbose message is
 * emitted for each entry changed.  Patching the BML flags directly is not
 * ideal, but it may become unnecessary if CUDA IPC is ever supported
 * everywhere.
 *
 * btl      - BTL module for which CUDA IPC has become available
 * flags    - not used by this function
 * errproc  - peer process; must be non-NULL and have a BML endpoint
 * btlinfo  - when non-NULL, points at the int id of the verbose output
 *            stream; otherwise stream 0 is used
 */
void mca_pml_ob1_cuda_add_ipc_support(struct mca_btl_base_module_t* btl, int32_t flags,
                                      ompi_proc_t* errproc, char* btlinfo)
{
    mca_bml_base_endpoint_t* endpoint;
    int verbose_stream = 0;
    int idx;
    int count;

    assert(NULL != errproc);
    assert(NULL != errproc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]);

    if (NULL != btlinfo) {
        verbose_stream = *(int *)btlinfo;
    }

    endpoint = (mca_bml_base_endpoint_t*)errproc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
    count = (int)endpoint->btl_send.arr_size;

    /* Flag each send entry that is backed by this btl */
    for (idx = 0; idx < count; idx++) {
        mca_bml_base_btl_t* entry = &endpoint->btl_send.bml_btls[idx];

        if (entry->btl != btl) {
            continue;
        }

        entry->btl_flags |= MCA_BTL_FLAGS_CUDA_GET;
        opal_output_verbose(5, verbose_stream,
                    "BTL %s: rank=%d enabling CUDA IPC "
                    "to rank=%d on node=%s \n",
                    btl->btl_component->btl_version.mca_component_name,
                    OMPI_PROC_MY_NAME->vpid,
                    errproc->proc_name.vpid,
                    errproc->proc_hostname);
    }
}
New btl that extends sm btl to support GPU transfers within a node. Uses new CUDA IPC support. Also, a few minor changes in PML to take advantage of it. This code has no effect unless user asks for it explicitly via configure arguments. Otherwise, it is either #ifdef'ed out or not compiled. This commit was SVN r26039. 2012-02-24 06:13:33 +04:00			`/*`
			`* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana`
			`* University Research and Technology`
			`* Corporation. All rights reserved.`
			`* Copyright (c) 2004-2008 The University of Tennessee and The University`
			`* of Tennessee Research Foundation. All rights`
			`* reserved.`
			`* Copyright (c) 2004-2008 High Performance Computing Center Stuttgart,`
			`* University of Stuttgart. All rights reserved.`
			`* Copyright (c) 2004-2005 The Regents of the University of California.`
			`* All rights reserved.`
			`* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.`
			`* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.`
Per this RFC from October 8, 2013 and as discussed in telecon. http://www.open-mpi.org/community/lists/devel/2013/10/13072.php Add support for pinning GPU Direct RDMA in openib BTL for better small message latency of GPU buffers. Note that none of this is compiled in unless CUDA-aware support is requested. This commit was SVN r29680. 2013-11-13 17:22:39 +04:00			`* Copyright (c) 2012-2013 NVIDIA Corporation. All rights reserved.`
New btl that extends sm btl to support GPU transfers within a node. Uses new CUDA IPC support. Also, a few minor changes in PML to take advantage of it. This code has no effect unless user asks for it explicitly via configure arguments. Otherwise, it is either #ifdef'ed out or not compiled. This commit was SVN r26039. 2012-02-24 06:13:33 +04:00			`* $COPYRIGHT$`
			`*`
			`* Additional copyrights may follow`
			`*`
			`* $HEADER$`
			`*/`


			`#include "ompi_config.h"`
			`#include "opal/prefetch.h"`
			`#include "ompi/constants.h"`
			`#include "ompi/mca/pml/pml.h"`
			`#include "ompi/mca/btl/btl.h"`
			`#include "ompi/mca/mpool/mpool.h"`
			`#include "pml_ob1.h"`
			`#include "pml_ob1_hdr.h"`
			`#include "pml_ob1_rdmafrag.h"`
			`#include "pml_ob1_recvreq.h"`
			`#include "pml_ob1_sendreq.h"`
			`#include "ompi/mca/bml/base/base.h"`
			`#include "ompi/memchecker.h"`

			`size_t mca_pml_ob1_rdma_cuda_btls(`
			`mca_bml_base_endpoint_t* bml_endpoint,`
			`unsigned char* base,`
			`size_t size,`
			`mca_pml_ob1_com_btl_t* rdma_btls);`

			`int mca_pml_ob1_cuda_need_buffers(void * rreq,`
			`mca_btl_base_module_t* btl);`

Fix support in smcuda btl so it does not blow up when there is no CUDA IPC support between two GPUs. Also make it so CUDA IPC support is added dynamically. Fixes ticket 3531. This commit was SVN r29055. 2013-08-22 01:00:09 +04:00			`void mca_pml_ob1_cuda_add_ipc_support(struct mca_btl_base_module_t* btl, int32_t flags,`
			`ompi_proc_t* errproc, char* btlinfo);`

New btl that extends sm btl to support GPU transfers within a node. Uses new CUDA IPC support. Also, a few minor changes in PML to take advantage of it. This code has no effect unless user asks for it explicitly via configure arguments. Otherwise, it is either #ifdef'ed out or not compiled. This commit was SVN r26039. 2012-02-24 06:13:33 +04:00			`/**`
			`* Handle the CUDA buffer.`
			`*/`
			`int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,`
			`mca_bml_base_btl_t* bml_btl,`
			`size_t size) {`
			`int rc;`
Remove redundant macro. This was from the review of an earlier ticket. Fixes trac:3878. Reviewed by jsquyres. This commit was SVN r29581. The following Trac tickets were found above: Ticket 3878 --> https://svn.open-mpi.org/trac/ompi/ticket/3878 2013-11-01 16:19:40 +04:00			`#if OPAL_CUDA_SUPPORT_41`
Change some CUDA configure code and macro names per review request by jsquyres in ticket #3880. Functionally, nothing changes. This commit was SVN r29815. 2013-12-06 18:35:10 +04:00			`#if OPAL_CUDA_GDR_SUPPORT`
Per this RFC from October 8, 2013 and as discussed in telecon. http://www.open-mpi.org/community/lists/devel/2013/10/13072.php Add support for pinning GPU Direct RDMA in openib BTL for better small message latency of GPU buffers. Note that none of this is compiled in unless CUDA-aware support is requested. This commit was SVN r29680. 2013-11-13 17:22:39 +04:00			`/* With some BTLs, switch to RNDV from RGET at large messages */`
			`if ((sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) &&`
			`(sendreq->req_send.req_bytes_packed > (bml_btl->btl->btl_cuda_rdma_limit - sizeof(mca_pml_ob1_hdr_t)))) {`
			`return mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);`
			`}`
Change some CUDA configure code and macro names per review request by jsquyres in ticket #3880. Functionally, nothing changes. This commit was SVN r29815. 2013-12-06 18:35:10 +04:00			`#endif /* OPAL_CUDA_GDR_SUPPORT */`
Per this RFC from October 8, 2013 and as discussed in telecon. http://www.open-mpi.org/community/lists/devel/2013/10/13072.php Add support for pinning GPU Direct RDMA in openib BTL for better small message latency of GPU buffers. Note that none of this is compiled in unless CUDA-aware support is requested. This commit was SVN r29680. 2013-11-13 17:22:39 +04:00
New btl that extends sm btl to support GPU transfers within a node. Uses new CUDA IPC support. Also, a few minor changes in PML to take advantage of it. This code has no effect unless user asks for it explicitly via configure arguments. Otherwise, it is either #ifdef'ed out or not compiled. This commit was SVN r26039. 2012-02-24 06:13:33 +04:00			`sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA;`
			`if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) {`
			`unsigned char *base;`
			`opal_convertor_get_current_pointer( &sendreq->req_send.req_base.req_convertor, (void**)&base );`
			`/* Set flag back */`
			`sendreq->req_send.req_base.req_convertor.flags \|= CONVERTOR_CUDA;`
			`if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_ob1_rdma_cuda_btls(`
			`sendreq->req_endpoint,`
			`base,`
			`sendreq->req_send.req_bytes_packed,`
			`sendreq->req_rdma))) {`
			`rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl,`
			`sendreq->req_send.req_bytes_packed);`
			`if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {`
			`mca_pml_ob1_free_rdma_resources(sendreq);`
			`}`
			`} else {`
			`if (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_PUT) {`
			`rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size,`
			`MCA_PML_OB1_HDR_FLAGS_CONTIG);`
			`} else {`
Improve CUDA GPU transfers over openib BTL. Use asynchronous copies. This is the RFC that was submitted in July and December of 2012. This commit was SVN r27862. 2013-01-18 02:34:43 +04:00			`rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);`
New btl that extends sm btl to support GPU transfers within a node. Uses new CUDA IPC support. Also, a few minor changes in PML to take advantage of it. This code has no effect unless user asks for it explicitly via configure arguments. Otherwise, it is either #ifdef'ed out or not compiled. This commit was SVN r26039. 2012-02-24 06:13:33 +04:00			`}`
			`}`
			`} else {`
			`/* Do not send anything with first rendezvous message as copying GPU`
			`* memory into RNDV message is expensive. */`
			`sendreq->req_send.req_base.req_convertor.flags \|= CONVERTOR_CUDA;`
			`rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);`
			`}`
			`#else`
			`/* Just do the rendezvous but set initial data to be sent to zero */`
			`rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);`
Remove redundant macro. This was from the review of an earlier ticket. Fixes trac:3878. Reviewed by jsquyres. This commit was SVN r29581. The following Trac tickets were found above: Ticket 3878 --> https://svn.open-mpi.org/trac/ompi/ticket/3878 2013-11-01 16:19:40 +04:00			`#endif /* OPAL_CUDA_SUPPORT_41 */`
New btl that extends sm btl to support GPU transfers within a node. Uses new CUDA IPC support. Also, a few minor changes in PML to take advantage of it. This code has no effect unless user asks for it explicitly via configure arguments. Otherwise, it is either #ifdef'ed out or not compiled. This commit was SVN r26039. 2012-02-24 06:13:33 +04:00			`return rc;`
			`}`



			`size_t mca_pml_ob1_rdma_cuda_btls(`
			`mca_bml_base_endpoint_t* bml_endpoint,`
			`unsigned char* base,`
			`size_t size,`
			`mca_pml_ob1_com_btl_t* rdma_btls)`
			`{`
			`int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);`
			`double weight_total = 0;`
			`int num_btls_used = 0, n;`

			`/* shortcut when there are no rdma capable btls */`
			`if(num_btls == 0) {`
			`return 0;`
			`}`

			`/* check to see if memory is registered */`
			`for(n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request;`
			`n++) {`
			`mca_bml_base_btl_t* bml_btl =`
			`mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n);`

			`if (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) {`
			`mca_mpool_base_registration_t* reg = NULL;`
			`mca_mpool_base_module_t *btl_mpool = bml_btl->btl->btl_mpool;`

			`if( NULL != btl_mpool ) {`
			`/* register the memory */`
Per this RFC from October 8, 2013 and as discussed in telecon. http://www.open-mpi.org/community/lists/devel/2013/10/13072.php Add support for pinning GPU Direct RDMA in openib BTL for better small message latency of GPU buffers. Note that none of this is compiled in unless CUDA-aware support is requested. This commit was SVN r29680. 2013-11-13 17:22:39 +04:00			`btl_mpool->mpool_register(btl_mpool, base, size, MCA_MPOOL_FLAGS_CUDA_GPU_MEM, &reg);`
New btl that extends sm btl to support GPU transfers within a node. Uses new CUDA IPC support. Also, a few minor changes in PML to take advantage of it. This code has no effect unless user asks for it explicitly via configure arguments. Otherwise, it is either #ifdef'ed out or not compiled. This commit was SVN r26039. 2012-02-24 06:13:33 +04:00			`}`

			`if(NULL == reg)`
			`continue;`

			`rdma_btls[num_btls_used].bml_btl = bml_btl;`
			`rdma_btls[num_btls_used].btl_reg = reg;`
			`weight_total += bml_btl->btl_weight;`
			`num_btls_used++;`
			`}`
			`}`

			`/* if we don't use leave_pinned and all BTLs that already have this memory`
			`* registered amount to less then half of available bandwidth - fall back to`
			`* pipeline protocol */`
			`if(0 == num_btls_used \|\| (!mca_pml_ob1.leave_pinned && weight_total < 0.5))`
			`return 0;`

			`mca_pml_ob1_calc_weighted_length(rdma_btls, num_btls_used, size,`
			`weight_total);`

			`return num_btls_used;`
			`}`

			`int mca_pml_ob1_cuda_need_buffers(void * rreq,`
			`mca_btl_base_module_t* btl)`
			`{`
			`mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)rreq;`
Fix support in smcuda btl so it does not blow up when there is no CUDA IPC support between two GPUs. Also make it so CUDA IPC support is added dynamically. Fixes ticket 3531. This commit was SVN r29055. 2013-08-22 01:00:09 +04:00			`mca_bml_base_endpoint_t* bml_endpoint =`
Remove the proc_pml and proc_bml fields from ompi_proc_t and replace with a configure-time dynamic allocation of flags. The net result for platforms which only support BTL-based communication is a reduction of 8*nprocs bytes per process. Platforms which support both MTLs and BTLs will not see a space reduction, but will now be able to safely run both the MTL and BTL side-by-side, which will prove useful. This commit was SVN r29100. 2013-08-30 20:54:55 +04:00			`(mca_bml_base_endpoint_t*)recvreq->req_recv.req_base.req_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];`
Fix support in smcuda btl so it does not blow up when there is no CUDA IPC support between two GPUs. Also make it so CUDA IPC support is added dynamically. Fixes ticket 3531. This commit was SVN r29055. 2013-08-22 01:00:09 +04:00			`mca_bml_base_btl_t *bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_send, btl);`

New btl that extends sm btl to support GPU transfers within a node. Uses new CUDA IPC support. Also, a few minor changes in PML to take advantage of it. This code has no effect unless user asks for it explicitly via configure arguments. Otherwise, it is either #ifdef'ed out or not compiled. This commit was SVN r26039. 2012-02-24 06:13:33 +04:00			`if ((recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA) &&`
Fix support in smcuda btl so it does not blow up when there is no CUDA IPC support between two GPUs. Also make it so CUDA IPC support is added dynamically. Fixes ticket 3531. This commit was SVN r29055. 2013-08-22 01:00:09 +04:00			`(bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET)) {`
New btl that extends sm btl to support GPU transfers within a node. Uses new CUDA IPC support. Also, a few minor changes in PML to take advantage of it. This code has no effect unless user asks for it explicitly via configure arguments. Otherwise, it is either #ifdef'ed out or not compiled. This commit was SVN r26039. 2012-02-24 06:13:33 +04:00			`recvreq->req_recv.req_base.req_convertor.flags &= ~CONVERTOR_CUDA;`
			`if(opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == true) {`
			`recvreq->req_recv.req_base.req_convertor.flags \|= CONVERTOR_CUDA;`
			`return true;`
			`} else {`
			`recvreq->req_recv.req_base.req_convertor.flags \|= CONVERTOR_CUDA;`
			`return false;`
			`}`
			`}`
			`return true;`
			`}`

Fix support in smcuda btl so it does not blow up when there is no CUDA IPC support between two GPUs. Also make it so CUDA IPC support is added dynamically. Fixes ticket 3531. This commit was SVN r29055. 2013-08-22 01:00:09 +04:00			`/*`
			`* This function enables us to start using RDMA get protocol with GPU buffers.`
			`* We do this by adjusting the flags in the BML structure. This is not the`
			`* best thing, but this may go away if CUDA IPC is supported everywhere in the`
			`* future. */`
			`void mca_pml_ob1_cuda_add_ipc_support(struct mca_btl_base_module_t* btl, int32_t flags,`
			`ompi_proc_t* errproc, char* btlinfo)`
			`{`
			`mca_bml_base_endpoint_t* ep;`
			`int btl_verbose_stream = 0;`
			`int i;`

			`assert(NULL != errproc);`
Remove the proc_pml and proc_bml fields from ompi_proc_t and replace with a configure-time dynamic allocation of flags. The net result for platforms which only support BTL-based communication is a reduction of 8*nprocs bytes per process. Platforms which support both MTLs and BTLs will not see a space reduction, but will now be able to safely run both the MTL and BTL side-by-side, which will prove useful. This commit was SVN r29100. 2013-08-30 20:54:55 +04:00			`assert(NULL != errproc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]);`
Fix support in smcuda btl so it does not blow up when there is no CUDA IPC support between two GPUs. Also make it so CUDA IPC support is added dynamically. Fixes ticket 3531. This commit was SVN r29055. 2013-08-22 01:00:09 +04:00			`if (NULL != btlinfo) {`
			`btl_verbose_stream = *(int *)btlinfo;`
			`}`
Remove the proc_pml and proc_bml fields from ompi_proc_t and replace with a configure-time dynamic allocation of flags. The net result for platforms which only support BTL-based communication is a reduction of 8*nprocs bytes per process. Platforms which support both MTLs and BTLs will not see a space reduction, but will now be able to safely run both the MTL and BTL side-by-side, which will prove useful. This commit was SVN r29100. 2013-08-30 20:54:55 +04:00			`ep = (mca_bml_base_endpoint_t*)errproc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];`
Fix support in smcuda btl so it does not blow up when there is no CUDA IPC support between two GPUs. Also make it so CUDA IPC support is added dynamically. Fixes ticket 3531. This commit was SVN r29055. 2013-08-22 01:00:09 +04:00
			`/* Find the corresponding bml and adjust the flag to support CUDA get */`
			`for( i = 0; i < (int)ep->btl_send.arr_size; i++ ) {`
			`if( ep->btl_send.bml_btls[i].btl == btl ) {`
			`ep->btl_send.bml_btls[i].btl_flags \|= MCA_BTL_FLAGS_CUDA_GET;`
			`opal_output_verbose(5, btl_verbose_stream,`
			`"BTL %s: rank=%d enabling CUDA IPC "`
			`"to rank=%d on node=%s \n",`
			`btl->btl_component->btl_version.mca_component_name,`
			`OMPI_PROC_MY_NAME->vpid,`
			`errproc->proc_name.vpid,`
			`errproc->proc_hostname);`
			`}`
			`}`
			`}`