
Remove the redundant OMPI_CUDA_SUPPORT macros in favor of the OPAL_CUDA_SUPPORT equivalents. This came out of the review of an earlier ticket.

Fixes trac:3878.  Reviewed by jsquyres.

This commit was SVN r29581.

The following Trac tickets were found above:
  Ticket 3878 --> https://svn.open-mpi.org/trac/ompi/ticket/3878
This commit is contained in:
Rolf vandeVaart 2013-11-01 12:19:40 +00:00
parent 99f9fdee01
commit ee7510b025
35 changed files: 159 additions and 173 deletions
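The rename is mechanical: compile-time CUDA guards throughout the OMPI tree now key off the OPAL-level macros (OPAL_CUDA_SUPPORT, OPAL_CUDA_SUPPORT_41, OPAL_CUDA_SUPPORT_60) that the opal configure code already defines, rather than duplicate OMPI_* copies of the same values. A minimal sketch of the resulting guard pattern follows; register_gpu_memory is a made-up illustration, not a function from this commit, and opal_config.h as the home of the define is an assumption:

    #include <stddef.h>        /* size_t */
    #include "opal_config.h"   /* assumed to provide OPAL_CUDA_SUPPORT (0 or 1) */
    #if OPAL_CUDA_SUPPORT      /* was: OMPI_CUDA_SUPPORT */
    #include "ompi/mca/common/cuda/common_cuda.h"
    #endif /* OPAL_CUDA_SUPPORT */

    static void register_gpu_memory(void *base, size_t size, char *pool_name)
    {
    #if OPAL_CUDA_SUPPORT
        /* Compiled in only when configure detected CUDA support; this is
         * the same call the openib registration hooks make in the diff below. */
        mca_common_cuda_register(base, size, pool_name);
    #endif /* OPAL_CUDA_SUPPORT */
    }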

View file

@@ -127,7 +127,7 @@ mca_btl_openib_la_SOURCES = $(component_sources)
 mca_btl_openib_la_LDFLAGS = -module -avoid-version $(btl_openib_LDFLAGS)
 mca_btl_openib_la_LIBADD = $(btl_openib_LIBS) \
 $(top_ompi_builddir)/ompi/mca/common/verbs/libmca_common_verbs.la
-if MCA_ompi_cuda_support
+if OPAL_cuda_support
 mca_btl_openib_la_LIBADD += \
 $(top_ompi_builddir)/ompi/mca/common/cuda/libmca_common_cuda.la
 endif

View file

@@ -57,10 +57,10 @@
 #include "ompi/mca/mpool/mpool.h"
 #include "ompi/mca/mpool/grdma/mpool_grdma.h"
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 #include "opal/datatype/opal_datatype_cuda.h"
 #include "ompi/mca/common/cuda/common_cuda.h"
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 #include <errno.h>
 #include <sys/types.h>
@@ -1296,14 +1296,14 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
 iov.iov_base = (IOVBASE_TYPE *) ( (unsigned char*) ptr + reserve );
 rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data);
-#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
 /* If the convertor is copying the data asynchronously, then record an event
 * that will trigger the callback when it completes. Mark descriptor as async.*/
 if (convertor->flags & CONVERTOR_CUDA_ASYNC) {
 mca_common_cuda_record_dtoh_event("btl_openib", (mca_btl_base_descriptor_t *)frag);
 to_base_frag(frag)->base.des_flags = flags | MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC;
 }
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 *size = max_data;

View file

@@ -311,10 +311,10 @@ struct mca_btl_openib_component_t {
 size_t memalign_threshold;
 void* (*previous_malloc_hook)(size_t __size, const void*);
 #endif
-#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
 bool cuda_async_send;
 bool cuda_async_recv;
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 #if HAVE_DECL_IBV_LINK_LAYER_ETHERNET
 bool rroce_enable;
 #endif

View file

@@ -113,12 +113,12 @@ static int btl_openib_component_open(void);
 static int btl_openib_component_close(void);
 static mca_btl_base_module_t **btl_openib_component_init(int*, bool, bool);
 static int btl_openib_component_progress(void);
-#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
 static void btl_openib_handle_incoming_completion(mca_btl_base_module_t* btl,
 mca_btl_openib_endpoint_t *ep,
 mca_btl_base_descriptor_t* des,
 int status);
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 /*
 * Local variables
 */
@@ -605,7 +605,7 @@ static int openib_reg_mr(void *reg_data, void *base, size_t size,
 "openib_reg_mr: base=%p, bound=%p, size=%d, flags=0x%x", reg->base, reg->bound,
 (int) (reg->bound - reg->base + 1), reg->flags));
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 if (reg->flags & MCA_MPOOL_FLAGS_CUDA_REGISTER_MEM) {
 mca_common_cuda_register(base, size,
 openib_reg->base.mpool->mpool_component->mpool_version.mca_component_name);
@@ -631,7 +631,7 @@ static int openib_dereg_mr(void *reg_data, mca_mpool_base_registration_t *reg)
 return OMPI_ERROR;
 }
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 if (reg->flags & MCA_MPOOL_FLAGS_CUDA_REGISTER_MEM) {
 mca_common_cuda_unregister(openib_reg->base.base,
 openib_reg->base.mpool->mpool_component->mpool_version.mca_component_name);
@@ -3133,13 +3133,13 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
 /* call registered callback */
 mca_btl_active_message_callback_t* reg;
-#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
 /* The COPY_ASYNC flag should not be set */
 assert(0 == (des->des_flags & MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC));
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 reg = mca_btl_base_active_message_trigger + hdr->tag;
 reg->cbfunc( &openib_btl->super, hdr->tag, des, reg->cbdata );
-#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
 if (des->des_flags & MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC) {
 /* Since ASYNC flag is set, we know this descriptor is being used
 * for asynchronous copy and cannot be freed yet. Therefore, set
@@ -3149,7 +3149,7 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
 des->des_cbdata = (void *)ep;
 return OMPI_SUCCESS;
 }
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 if(MCA_BTL_OPENIB_RDMA_FRAG(frag)) {
 cqp = (hdr->credits >> 11) & 0x0f;
 hdr->credits &= 0x87ff;
@@ -3240,7 +3240,7 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
 return OMPI_SUCCESS;
 }
-#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
 /**
 * Called by the PML when the copying of the data out of the fragment
 * is complete.
@@ -3316,7 +3316,7 @@ static void btl_openib_handle_incoming_completion(mca_btl_base_module_t* btl,
 send_credits(ep, cqp);
 }
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 static char* btl_openib_component_status_to_string(enum ibv_wc_status status)
 {
@@ -3800,7 +3800,7 @@ static int btl_openib_component_progress(void)
 count += progress_one_device(device);
 }
-#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
 /* Check to see if there are any outstanding dtoh CUDA events that
 * have completed. If so, issue the PML callbacks on the fragments.
 * The only thing that gets completed here are asynchronous copies
@@ -3819,7 +3819,7 @@ static int btl_openib_component_progress(void)
 if (count > 0) {
 OPAL_OUTPUT((-1, "btl_openib: DONE with openib progress, count=%d", count));
 }
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 return count;

View file

@@ -564,7 +564,7 @@ int btl_openib_register_mca_params(void)
 /* Default to bandwidth auto-detection */
 mca_btl_openib_module.super.btl_bandwidth = 0;
 mca_btl_openib_module.super.btl_latency = 4;
-#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
 /* Default is enabling CUDA asynchronous send copies */
 CHECK(reg_bool("cuda_async_send", NULL,
 "Enable or disable CUDA async send copies "
@@ -580,7 +580,7 @@ int btl_openib_register_mca_params(void)
 mca_btl_openib_module.super.btl_max_send_size = 128 * 1024;
 /* Turn of message coalescing - not sure if it works with GPU buffers */
 mca_btl_openib_component.use_message_coalescing = 0;
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 CHECK(mca_btl_base_param_register(
 &mca_btl_openib_component.super.btl_version,
 &mca_btl_openib_module.super));
@@ -727,7 +727,7 @@ int btl_openib_verify_mca_params (void)
 mca_btl_openib_component.buffer_alignment = 64;
 }
-#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
 if (mca_btl_openib_component.cuda_async_send) {
 mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND;
 } else {

View file

@@ -51,7 +51,7 @@ mca_btl_smcuda_la_LDFLAGS = -module -avoid-version
 mca_btl_smcuda_la_LIBADD = \
 $(top_ompi_builddir)/ompi/mca/common/sm/libmca_common_sm.la
 mca_btl_smcuda_la_CPPFLAGS = $(btl_smcuda_CPPFLAGS)
-if MCA_ompi_cuda_support
+if OPAL_cuda_support
 mca_btl_smcuda_la_LIBADD += \
 $(top_ompi_builddir)/ompi/mca/common/cuda/libmca_common_cuda.la
 endif

View file

@@ -51,9 +51,9 @@
 #include "ompi/class/ompi_free_list.h"
 #include "ompi/runtime/ompi_module_exchange.h"
 #include "ompi/mca/btl/btl.h"
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 #include "ompi/mca/common/cuda/common_cuda.h"
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 #include "ompi/mca/mpool/base/base.h"
 #include "ompi/mca/mpool/sm/mpool_sm.h"
@@ -91,11 +91,11 @@ mca_btl_smcuda_t mca_btl_smcuda = {
 mca_btl_smcuda_alloc,
 mca_btl_smcuda_free,
 mca_btl_smcuda_prepare_src,
-#if OMPI_CUDA_SUPPORT || OMPI_BTL_SM_HAVE_KNEM || OMPI_BTL_SM_HAVE_CMA
+#if OPAL_CUDA_SUPPORT || OMPI_BTL_SM_HAVE_KNEM || OMPI_BTL_SM_HAVE_CMA
 mca_btl_smcuda_prepare_dst,
 #else
 NULL,
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 mca_btl_smcuda_send,
 mca_btl_smcuda_sendi,
 NULL, /* put */
@@ -107,10 +107,10 @@ mca_btl_smcuda_t mca_btl_smcuda = {
 }
 };
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 static void mca_btl_smcuda_send_cuda_ipc_request(struct mca_btl_base_module_t* btl,
 struct mca_btl_base_endpoint_t* endpoint);
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 /*
 * calculate offset of an address from the beginning of a shared memory segment
@@ -341,7 +341,7 @@ smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl,
 return rc;
 }
 }
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 /* Create a local memory pool that sends handles to the remote
 * side. Note that the res argument is not really used, but
 * needed to satisfy function signature. */
@@ -351,7 +351,7 @@ smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl,
 if (NULL == smcuda_btl->super.btl_mpool) {
 return OMPI_ERR_OUT_OF_RESOURCE;
 }
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 /* it is now safe to free the mpool resources */
 free(res);
@@ -478,7 +478,7 @@ create_sm_endpoint(int local_proc, struct ompi_proc_t *proc)
 return NULL;
 }
 #endif
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 {
 mca_mpool_base_resources_t resources; /* unused, but needed */
@@ -489,7 +489,7 @@ create_sm_endpoint(int local_proc, struct ompi_proc_t *proc)
 NULL,
 &resources);
 }
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 return ep;
 }
@@ -543,11 +543,11 @@ int mca_btl_smcuda_add_procs(
 return_code = OMPI_ERROR;
 goto CLEANUP;
 }
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 peers[proc]->proc_ompi = procs[proc];
 peers[proc]->ipcstate = IPC_INIT;
 peers[proc]->ipctries = 0;
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 n_local_procs++;
@@ -795,9 +795,9 @@ struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_src(
 size_t max_data = *size;
 int rc;
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 if (0 != reserve) {
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 if ( reserve + max_data <= mca_btl_smcuda_component.eager_limit ) {
 MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag);
 } else {
@@ -820,7 +820,7 @@ struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_src(
 return NULL;
 }
 frag->segment.base.seg_len = reserve + max_data;
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 } else {
 /* Normally, we are here because we have a GPU buffer and we are preparing
 * to send it. However, we can also be there because we have received a
@@ -851,7 +851,7 @@ struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_src(
 frag->segment.memh_seg_len = registration->bound - registration->base + 1;
 }
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 frag->base.des_src = &(frag->segment.base);
 frag->base.des_src_cnt = 1;
 frag->base.order = MCA_BTL_NO_ORDER;
@@ -919,12 +919,12 @@ int mca_btl_smcuda_sendi( struct mca_btl_base_module_t* btl,
 mca_btl_smcuda_component_progress();
 }
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 /* Initiate setting up CUDA IPC support. */
 if (mca_common_cuda_enabled && (IPC_INIT == endpoint->ipcstate) && mca_btl_smcuda_component.use_cuda_ipc) {
 mca_btl_smcuda_send_cuda_ipc_request(btl, endpoint);
 }
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 /* this check should be unnecessary... turn into an assertion? */
 if( length < mca_btl_smcuda_component.eager_limit ) {
@@ -1004,12 +1004,12 @@ int mca_btl_smcuda_send( struct mca_btl_base_module_t* btl,
 mca_btl_smcuda_component_progress();
 }
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 /* Initiate setting up CUDA IPC support */
 if (mca_common_cuda_enabled && (IPC_INIT == endpoint->ipcstate) && mca_btl_smcuda_component.use_cuda_ipc) {
 mca_btl_smcuda_send_cuda_ipc_request(btl, endpoint);
 }
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 /* available header space */
 frag->hdr->len = frag->segment.base.seg_len;
@@ -1036,7 +1036,7 @@ int mca_btl_smcuda_send( struct mca_btl_base_module_t* btl,
 */
 return 0;
 }
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_dst(
 struct mca_btl_base_module_t* btl,
 struct mca_btl_base_endpoint_t* endpoint,
@@ -1071,10 +1071,10 @@ struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_dst(
 frag->base.des_flags = flags;
 return &frag->base;
 }
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t* btl,
 struct mca_btl_base_endpoint_t* ep,
 struct mca_btl_base_descriptor_t* descriptor)
@@ -1242,7 +1242,7 @@ static void mca_btl_smcuda_send_cuda_ipc_request(struct mca_btl_base_module_t* b
 }
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 /**
 *

View file

@@ -202,12 +202,12 @@ struct mca_btl_smcuda_component_t {
 char *sm_mpool_rndv_file_name;
 char *sm_ctl_file_name;
 char *sm_rndv_file_name;
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 int cuda_ipc_verbose;
 int cuda_ipc_output;
 int use_cuda_ipc;
 int use_cuda_ipc_same_gpu;
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 };
 typedef struct mca_btl_smcuda_component_t mca_btl_smcuda_component_t;
 OMPI_MODULE_DECLSPEC extern mca_btl_smcuda_component_t mca_btl_smcuda_component;
@@ -478,7 +478,7 @@ extern int mca_btl_smcuda_send(
 mca_btl_base_tag_t tag
 );
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 /**
 * Remote get using device memory.
 */
@@ -519,7 +519,7 @@ enum ipcState {
 IPC_BAD
 };
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 extern void mca_btl_smcuda_dump(struct mca_btl_base_module_t* btl,

View file

@@ -53,10 +53,10 @@
 #include "ompi/mca/common/sm/common_sm.h"
 #include "ompi/mca/btl/base/btl_base_error.h"
 #include "ompi/mca/rte/rte.h"
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 #include "ompi/runtime/params.h"
 #include "ompi/mca/common/cuda/common_cuda.h"
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 #if OPAL_ENABLE_FT_CR == 1
 #include "opal/runtime/opal_cr.h"
@@ -163,7 +163,7 @@ static int smcuda_register(void)
 /* default number of extra procs to allow for future growth */
 mca_btl_smcuda_param_register_int("sm_extra_procs", 0, &mca_btl_smcuda_component.sm_extra_procs);
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 /* Lower priority when CUDA support is not requested */
 if (ompi_mpi_cuda_support) {
 mca_btl_smcuda.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH;
@@ -175,9 +175,9 @@ static int smcuda_register(void)
 mca_btl_smcuda_param_register_int("cuda_ipc_verbose", 0, &mca_btl_smcuda_component.cuda_ipc_verbose);
 mca_btl_smcuda_component.cuda_ipc_output = opal_output_open(NULL);
 opal_output_set_verbosity(mca_btl_smcuda_component.cuda_ipc_output, mca_btl_smcuda_component.cuda_ipc_verbose);
-#else /* OMPI_CUDA_SUPPORT */
+#else /* OPAL_CUDA_SUPPORT */
 mca_btl_smcuda.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH-1;
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 mca_btl_smcuda.super.btl_eager_limit = 4*1024;
 mca_btl_smcuda.super.btl_rndv_eager_limit = 4*1024;
 mca_btl_smcuda.super.btl_max_send_size = 32*1024;
@@ -619,7 +619,7 @@ out:
 return rc;
 }
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 /**
 * Send a CUDA IPC ACK or NOTREADY message back to the peer.
@@ -827,7 +827,7 @@ static void btl_smcuda_control(mca_btl_base_module_t* btl,
 }
 }
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 /*
 * SM component initialization
@@ -931,13 +931,13 @@ mca_btl_smcuda_component_init(int *num_btls,
 /* set flag indicating btl not inited */
 mca_btl_smcuda.btl_inited = false;
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 /* Assume CUDA GET works. */
 mca_btl_smcuda.super.btl_get = mca_btl_smcuda_get_cuda;
 /* Register a smcuda control function to help setup IPC support */
 mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA].cbfunc = btl_smcuda_control;
 mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA].cbdata = NULL;
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 return btls;
@@ -1065,9 +1065,9 @@ int mca_btl_smcuda_component_progress(void)
 seg.seg_len = hdr->len;
 Frag.base.des_dst_cnt = 1;
 Frag.base.des_dst = &seg;
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 Frag.hdr = hdr; /* needed for peer rank in control messages */
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 reg->cbfunc(&mca_btl_smcuda.super, hdr->tag, &(Frag.base),
 reg->cbdata);
 /* return the fragment */
@@ -1126,7 +1126,7 @@ int mca_btl_smcuda_component_progress(void)
 }
 }
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 /* Check to see if there are any outstanding CUDA events that have
 * completed. If so, issue the PML callbacks on the fragments.
 */
@@ -1149,6 +1149,6 @@ int mca_btl_smcuda_component_progress(void)
 }
 nevents++;
 }
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 return nevents;
 }

View file

@@ -34,9 +34,9 @@ struct mca_btl_base_endpoint_t {
 * SMP specfic data structures. */
 int peer_smp_rank; /**< My peer's SMP process rank. Used for accessing
 * SMP specfic data structures. */
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 mca_mpool_base_module_t *mpool; /**< mpool for remotely registered memory */
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 #if OMPI_ENABLE_PROGRESS_THREADS == 1
 int fifo_fd; /**< pipe/fifo used to signal endpoint that data is queued */
 #endif
@@ -45,11 +45,11 @@ struct mca_btl_base_endpoint_t {
 /** lock for concurrent access to endpoint state */
 opal_mutex_t endpoint_lock;
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 ompi_proc_t *proc_ompi; /**< Needed for adding CUDA IPC support dynamically */
 enum ipcState ipcstate; /**< CUDA IPC connection status */
 int ipctries; /**< Number of times CUDA IPC connect was sent */
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 };
 void btl_smcuda_process_pending_sends(struct mca_btl_base_endpoint_t *ep);

View file

@@ -37,9 +37,9 @@ static inline void mca_btl_smcuda_frag_common_constructor(mca_btl_smcuda_frag_t*
 frag->base.des_dst = &frag->segment.base;
 frag->base.des_dst_cnt = 1;
 frag->base.des_flags = 0;
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 frag->registration = NULL;
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 }
 static void mca_btl_smcuda_frag1_constructor(mca_btl_smcuda_frag_t* frag)

View file

@@ -48,13 +48,13 @@ typedef struct mca_btl_smcuda_hdr_t mca_btl_smcuda_hdr_t;
 struct mca_btl_smcuda_segment_t {
 mca_btl_base_segment_t base;
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 uint8_t key[128]; /* 64 bytes for CUDA mem handle, 64 bytes for CUDA event handle */
 /** Address of the entire memory handle */
 ompi_ptr_t memh_seg_addr;
 /** Length in bytes of entire memory handle */
 uint32_t memh_seg_len;
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 };
 typedef struct mca_btl_smcuda_segment_t mca_btl_smcuda_segment_t;
@@ -65,9 +65,9 @@ struct mca_btl_smcuda_frag_t {
 mca_btl_base_descriptor_t base;
 mca_btl_smcuda_segment_t segment;
 struct mca_btl_base_endpoint_t *endpoint;
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 struct mca_mpool_base_registration_t *registration;
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 size_t size;
 /* pointer written to the FIFO, this is the base of the shared memory region */
 mca_btl_smcuda_hdr_t *hdr;

View file

@@ -55,7 +55,7 @@ mcacomponentdir = $(pkglibdir)
 mcacomponent_LTLIBRARIES = $(component)
 mca_btl_tcp_la_SOURCES = $(component_sources)
 mca_btl_tcp_la_LDFLAGS = -module -avoid-version
-if MCA_ompi_cuda_support
+if OPAL_cuda_support
 mca_btl_tcp_la_LIBADD = \
 $(top_ompi_builddir)/ompi/mca/common/cuda/libmca_common_cuda.la
 endif

View file

@@ -69,9 +69,9 @@
 #include "btl_tcp_proc.h"
 #include "btl_tcp_frag.h"
 #include "btl_tcp_endpoint.h"
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 #include "ompi/mca/common/cuda/common_cuda.h"
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 /*
@@ -1084,9 +1084,9 @@ mca_btl_base_module_t** mca_btl_tcp_component_init(int *num_btl_modules,
 return NULL;
 }
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 mca_common_cuda_stage_one_init();
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 memcpy(btls, mca_btl_tcp_component.tcp_btls, mca_btl_tcp_component.tcp_num_btls*sizeof(mca_btl_tcp_module_t*));
 *num_btl_modules = mca_btl_tcp_component.tcp_num_btls;

View file

@@ -21,7 +21,7 @@
 * This file contains various support functions for doing CUDA
 * operations. Some of the features are only available in CUDA 4.1
 * and later, so some code is conditionalized around the
- * OMPI_CUDA_SUPPORT_41 macro.
+ * OPAL_CUDA_SUPPORT_41 macro.
 */
 #include "ompi_config.h"
@@ -81,13 +81,13 @@ struct cudaFunctionTable {
 int (*cuEventDestroy)(CUevent);
 int (*cuStreamWaitEvent)(CUstream, CUevent, unsigned int);
 int (*cuMemGetAddressRange)(CUdeviceptr*, size_t*, CUdeviceptr);
-#if OMPI_CUDA_SUPPORT_41
+#if OPAL_CUDA_SUPPORT_41
 int (*cuIpcGetEventHandle)(CUipcEventHandle*, CUevent);
 int (*cuIpcOpenEventHandle)(CUevent*, CUipcEventHandle);
 int (*cuIpcOpenMemHandle)(CUdeviceptr*, CUipcMemHandle, unsigned int);
 int (*cuIpcCloseMemHandle)(CUdeviceptr);
 int (*cuIpcGetMemHandle)(CUipcMemHandle*, CUdeviceptr);
-#endif /* OMPI_CUDA_SUPPORT_41 */
+#endif /* OPAL_CUDA_SUPPORT_41 */
 int (*cuCtxGetDevice)(CUdevice *);
 int (*cuDeviceCanAccessPeer)(int *, CUdevice, CUdevice);
 int (*cuDeviceGet)(CUdevice *, int);
@@ -132,7 +132,7 @@ OBJ_CLASS_INSTANCE(common_cuda_mem_regs_t,
 NULL,
 NULL);
-#if OMPI_CUDA_SUPPORT_41
+#if OPAL_CUDA_SUPPORT_41
 static int mca_common_cuda_async = 1;
 /* Array of CUDA events to be queried for IPC stream, sending side and
@@ -185,7 +185,7 @@ static void cuda_dump_memhandle(int, void *, char *) __opal_attribute_unused__ ;
 #define CUDA_DUMP_EVTHANDLE(a)
 #endif /* OPAL_ENABLE_DEBUG */
-#endif /* OMPI_CUDA_SUPPORT_41 */
+#endif /* OPAL_CUDA_SUPPORT_41 */
 /**
@@ -269,7 +269,7 @@ int mca_common_cuda_stage_one_init(void)
 MCA_BASE_VAR_SCOPE_READONLY,
 &mca_common_cuda_warning);
-#if OMPI_CUDA_SUPPORT_41
+#if OPAL_CUDA_SUPPORT_41
 /* Use this flag to test async vs sync copies */
 mca_common_cuda_async = 1;
 (void) mca_base_var_register("ompi", "mpi", "common_cuda", "memcpy_async",
@@ -287,7 +287,7 @@ int mca_common_cuda_stage_one_init(void)
 OPAL_INFO_LVL_9,
 MCA_BASE_VAR_SCOPE_READONLY,
 &cuda_event_max);
-#endif /* OMPI_CUDA_SUPPORT_41 */
+#endif /* OPAL_CUDA_SUPPORT_41 */
 mca_common_cuda_output = opal_output_open(NULL);
 opal_output_set_verbosity(mca_common_cuda_output, mca_common_cuda_verbose);
@@ -439,13 +439,13 @@ int mca_common_cuda_stage_one_init(void)
 OMPI_CUDA_DLSYM(libcuda_handle, cuMemFree);
 OMPI_CUDA_DLSYM(libcuda_handle, cuMemAlloc);
 OMPI_CUDA_DLSYM(libcuda_handle, cuMemGetAddressRange);
-#if OMPI_CUDA_SUPPORT_41
+#if OPAL_CUDA_SUPPORT_41
 OMPI_CUDA_DLSYM(libcuda_handle, cuIpcGetEventHandle);
 OMPI_CUDA_DLSYM(libcuda_handle, cuIpcOpenEventHandle);
 OMPI_CUDA_DLSYM(libcuda_handle, cuIpcOpenMemHandle);
 OMPI_CUDA_DLSYM(libcuda_handle, cuIpcCloseMemHandle);
 OMPI_CUDA_DLSYM(libcuda_handle, cuIpcGetMemHandle);
-#endif /* OMPI_CUDA_SUPPORT_41 */
+#endif /* OPAL_CUDA_SUPPORT_41 */
 OMPI_CUDA_DLSYM(libcuda_handle, cuCtxGetDevice);
 OMPI_CUDA_DLSYM(libcuda_handle, cuDeviceCanAccessPeer);
 OMPI_CUDA_DLSYM(libcuda_handle, cuDeviceGet);
@@ -527,7 +527,7 @@ static int mca_common_cuda_stage_three_init(void)
 return OMPI_ERROR;
 }
-#if OMPI_CUDA_SUPPORT_41
+#if OPAL_CUDA_SUPPORT_41
 if (true == mca_common_cuda_enabled) {
 /* Set up an array to store outstanding IPC async copy events */
 cuda_event_ipc_array = NULL;
@@ -564,7 +564,7 @@ static int mca_common_cuda_stage_three_init(void)
 }
 }
-#endif /* OMPI_CUDA_SUPPORT_41 */
+#endif /* OPAL_CUDA_SUPPORT_41 */
 if (true == mca_common_cuda_enabled) {
 /* Set up an array to store outstanding async dtoh events. Used on the
 * sending side for asynchronous copies. */
@@ -782,7 +782,7 @@ void mca_common_cuda_unregister(void *ptr, char *msg) {
 }
 }
-#if OMPI_CUDA_SUPPORT_41
+#if OPAL_CUDA_SUPPORT_41
 /*
 * Get the memory handle of a local section of memory that can be sent
 * to the remote size so it can access the memory. This is the
@@ -1467,7 +1467,7 @@ static float mydifftime(struct timespec ts_start, struct timespec ts_end) {
 }
 #endif /* CUDA_COMMON_TIMING */
-#endif /* OMPI_CUDA_SUPPORT_41 */
+#endif /* OPAL_CUDA_SUPPORT_41 */
 /* Routines that get plugged into the opal datatype code */
 static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf)
@@ -1613,7 +1613,7 @@ int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base)
 return 0;
 }
-#if OMPI_CUDA_SUPPORT_60 && OMPI_GDR_SUPPORT
+#if OPAL_CUDA_SUPPORT_60 && OMPI_GDR_SUPPORT
 int mca_common_cuda_previously_freed_memory(mca_mpool_base_registration_t *reg)
 {
 int res;
@@ -1657,5 +1657,5 @@ void mca_common_cuda_get_buffer_id(mca_mpool_base_registration_t *reg)
 reg->gpu_bufID = bufID;
 }
-#endif /* OMPI_CUDA_SUPPORT_60 */
+#endif /* OPAL_CUDA_SUPPORT_60 */

View file

@@ -75,10 +75,10 @@ OMPI_DECLSPEC int mca_common_cuda_get_device(int *devicenum);
 OMPI_DECLSPEC int mca_common_cuda_device_can_access_peer(int *access, int dev1, int dev2);
 OMPI_DECLSPEC int mca_common_cuda_stage_one_init(void);
 OMPI_DECLSPEC int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base);
-#if OMPI_CUDA_SUPPORT_60 && OMPI_GDR_SUPPORT
+#if OPAL_CUDA_SUPPORT_60 && OMPI_GDR_SUPPORT
 OMPI_DECLSPEC int mca_common_cuda_previously_freed_memory(mca_mpool_base_registration_t *reg);
 OMPI_DECLSPEC void mca_common_cuda_get_buffer_id(mca_mpool_base_registration_t *reg);
-#endif /* OMPI_CUDA_SUPPORT_60 */
+#endif /* OPAL_CUDA_SUPPORT_60 */
 /**
 * Return: 0 if no packing is required for sending (the upper layer
 * can use directly the pointer to the contiguous user

View file

@@ -10,8 +10,8 @@
 #
 # If CUDA support was requested, then build the CUDA support library.
-# This code checks the variable CUDA_SUPPORT which was set earlier in
-# the configure sequence by the opal_configure_options.m4 code.
+# This code checks just makes sure the check was done earlier by the
+# opal_check_cuda.m4 code.
 #
 AC_DEFUN([MCA_ompi_common_cuda_CONFIG],[
@@ -20,24 +20,10 @@ AC_DEFUN([MCA_ompi_common_cuda_CONFIG],[
 # make sure that CUDA-aware checks have been done
 AC_REQUIRE([OPAL_CHECK_CUDA])
-# Use CUDA_SUPPORT which was filled in by the opal configure code.
-AM_CONDITIONAL([MCA_ompi_cuda_support], [test "x$CUDA_SUPPORT" = "x1"])
-AC_DEFINE_UNQUOTED([OMPI_CUDA_SUPPORT],$CUDA_SUPPORT,
-[Whether we want cuda memory registration support in OMPI code])
-AS_IF([test "x$CUDA_SUPPORT" = "x1"],
+AS_IF([test "x$OPAL_CUDA_SUPPORT" = "x1"],
 [$1],
 [$2])
-# Check to see if we have features of CUDA 4.1 available as well.
-AM_CONDITIONAL([MCA_ompi_cuda_support_41], [test "x$CUDA_SUPPORT_41" = "x1"])
-AC_DEFINE_UNQUOTED([OMPI_CUDA_SUPPORT_41],$CUDA_SUPPORT_41,
-[Whether we want support CUDA 4.1 features])
-# Check to see if we have features of CUDA 6.0 available as well.
-AM_CONDITIONAL([MCA_ompi_cuda_support_60], [test "x$CUDA_SUPPORT_60" = "x1"])
-AC_DEFINE_UNQUOTED([OMPI_CUDA_SUPPORT_60],$CUDA_SUPPORT_60,
-[Whether we want support CUDA 6.0 features])
 # Copy over the includes needed to build CUDA
 common_cuda_CPPFLAGS=$opal_datatype_cuda_CPPFLAGS
 AC_SUBST([common_cuda_CPPFLAGS])
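The Makefile.am fragments in this commit test the OPAL_cuda_support Automake conditional, and the C guards test OPAL_CUDA_SUPPORT; both are presumably declared once by the OPAL-side configury that OPAL_CHECK_CUDA pulls in (opal_check_cuda.m4). A hedged sketch of those assumed declarations, not code from this commit:

    # Assumed OPAL-side declarations that the simplified m4 above relies on.
    AM_CONDITIONAL([OPAL_cuda_support], [test "x$CUDA_SUPPORT" = "x1"])
    AC_DEFINE_UNQUOTED([OPAL_CUDA_SUPPORT], [$CUDA_SUPPORT],
                       [Whether we have CUDA support available in OPAL])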

View file

@@ -46,7 +46,7 @@ mcacomponent_LTLIBRARIES = $(component_install)
 mca_mpool_gpusm_la_SOURCES = $(sources)
 mca_mpool_gpusm_la_LDFLAGS = -module -avoid-version
 mca_mpool_gpusm_la_LIBADD = $(mpool_gpusm_LIBS)
-if MCA_ompi_cuda_support
+if OPAL_cuda_support
 mca_mpool_gpusm_la_LIBADD += \
 $(top_ompi_builddir)/ompi/mca/common/cuda/libmca_common_cuda.la
 endif

View file

@@ -131,7 +131,7 @@ void* mca_mpool_grdma_alloc(mca_mpool_base_module_t *mpool, size_t size,
 if(0 == align)
 align = mca_mpool_base_page_size;
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 /* CUDA cannot handle registering overlapping regions, so make
 * sure each region is page sized and page aligned. */
 align = mca_mpool_base_page_size;

View file

@@ -46,7 +46,7 @@ mcacomponent_LTLIBRARIES = $(component_install)
 mca_mpool_rgpusm_la_SOURCES = $(sources)
 mca_mpool_rgpusm_la_LDFLAGS = -module -avoid-version
 mca_mpool_rgpusm_la_LIBADD = $(mpool_rgpusm_LIBS)
-if MCA_ompi_cuda_support
+if OPAL_cuda_support
 mca_mpool_rgpusm_la_LIBADD += \
 $(top_ompi_builddir)/ompi/mca/common/cuda/libmca_common_cuda.la
 endif

View file

@@ -44,7 +44,7 @@ mca_mpool_sm_la_SOURCES = $(sources)
 mca_mpool_sm_la_LDFLAGS = -module -avoid-version
 mca_mpool_sm_la_LIBADD = \
 $(top_ompi_builddir)/ompi/mca/common/sm/libmca_common_sm.la
-if MCA_ompi_cuda_support
+if OPAL_cuda_support
 mca_mpool_sm_la_LIBADD += \
 $(top_ompi_builddir)/ompi/mca/common/cuda/libmca_common_cuda.la
 endif

View file

@@ -53,7 +53,7 @@ bfo_sources = \
 pml_bfo_start.c
 # If we have CUDA support requested, build the CUDA file also
-if MCA_ompi_cuda_support
+if OPAL_cuda_support
 bfo_sources += \
 pml_bfo_cuda.c
 endif

View file

@@ -50,7 +50,7 @@ int mca_pml_bfo_send_request_start_cuda(mca_pml_bfo_send_request_t* sendreq,
 mca_bml_base_btl_t* bml_btl,
 size_t size) {
 int rc;
-#if OMPI_CUDA_SUPPORT_41
+#if OPAL_CUDA_SUPPORT_41
 sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA;
 if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) {
 unsigned char *base;
@@ -84,7 +84,7 @@ int mca_pml_bfo_send_request_start_cuda(mca_pml_bfo_send_request_t* sendreq,
 #else
 /* Just do the rendezvous but set initial data to be sent to zero */
 rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, 0, 0);
-#endif /* OMPI_CUDA_SUPPORT_41 */
+#endif /* OPAL_CUDA_SUPPORT_41 */
 return rc;
 }

View file

@@ -36,10 +36,10 @@
 #include "opal/util/arch.h"
 #include "ompi/memchecker.h"
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 int mca_pml_bfo_cuda_need_buffers(mca_pml_bfo_recv_request_t* recvreq,
 mca_btl_base_module_t* btl);
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 #if PML_BFO
 #include "pml_bfo_failover.h"
 #endif /* PML_BFO */
@@ -540,15 +540,15 @@ void mca_pml_bfo_recv_request_progress_rget( mca_pml_bfo_recv_request_t* recvreq
 * sender side is already registered. We need to be smarter here, perhaps
 * do couple of RDMA reads */
 if(opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == true) {
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 if (mca_pml_bfo_cuda_need_buffers(recvreq, btl)) {
 mca_pml_bfo_recv_request_ack(recvreq, &hdr->hdr_rndv, 0);
 return;
 }
-#else /* OMPI_CUDA_SUPPORT */
+#else /* OPAL_CUDA_SUPPORT */
 mca_pml_bfo_recv_request_ack(recvreq, &hdr->hdr_rndv, 0);
 return;
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 }
 MCA_PML_BFO_RDMA_FRAG_ALLOC(frag);
@@ -583,7 +583,7 @@ void mca_pml_bfo_recv_request_progress_rget( mca_pml_bfo_recv_request_t* recvreq
 frag->rdma_btl = btl;
 #endif /* PML_BFO */
 frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl);
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 if( OPAL_UNLIKELY(NULL == frag->rdma_bml) ) {
 if (recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA) {
 /* Check to see if this is a CUDA get */
@@ -600,12 +600,12 @@ void mca_pml_bfo_recv_request_progress_rget( mca_pml_bfo_recv_request_t* recvreq
 return;
 }
 }
-#else /* OMPI_CUDA_SUPPORT */
+#else /* OPAL_CUDA_SUPPORT */
 if( OPAL_UNLIKELY(NULL == frag->rdma_bml) ) {
 opal_output(0, "[%s:%d] invalid bml for rdma get", __FILE__, __LINE__);
 ompi_rte_abort(-1, NULL);
 }
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 frag->rdma_hdr.hdr_rget = *hdr;
 frag->rdma_req = recvreq;
 frag->rdma_ep = bml_endpoint;

View file

@@ -319,12 +319,12 @@ mca_pml_bfo_send_request_schedule(mca_pml_bfo_send_request_t* sendreq)
 mca_pml_bfo_send_request_schedule_exclusive(sendreq);
 }
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 int mca_pml_bfo_send_request_start_cuda(
 mca_pml_bfo_send_request_t* sendreq,
 mca_bml_base_btl_t* bml_btl,
 size_t size);
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 /**
 * Start the specified request
@@ -410,11 +410,11 @@ mca_pml_bfo_send_request_start_btl( mca_pml_bfo_send_request_t* sendreq,
 MCA_PML_BFO_HDR_FLAGS_CONTIG);
 }
 } else {
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 if (sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) {
 return mca_pml_bfo_send_request_start_cuda(sendreq, bml_btl, size);
 }
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, size, 0);
 }
 }

View file

@@ -49,7 +49,7 @@ ob1_sources = \
 pml_ob1_start.c
 # If we have CUDA support requested, build the CUDA file also
-if MCA_ompi_cuda_support
+if OPAL_cuda_support
 ob1_sources += \
 pml_ob1_cuda.c
 endif

View file

@@ -79,11 +79,11 @@ mca_pml_ob1_t mca_pml_ob1 = {
 }
 };
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 void mca_pml_ob1_cuda_add_ipc_support(struct mca_btl_base_module_t* btl,
 int32_t flags, ompi_proc_t* errproc,
 char* btlinfo);
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 void mca_pml_ob1_error_handler( struct mca_btl_base_module_t* btl,
 int32_t flags, ompi_proc_t* errproc,
@@ -732,12 +732,12 @@ void mca_pml_ob1_process_pending_rdma(void)
 void mca_pml_ob1_error_handler(
 struct mca_btl_base_module_t* btl, int32_t flags,
 ompi_proc_t* errproc, char* btlinfo ) {
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 if (flags & MCA_BTL_ERROR_FLAGS_ADD_CUDA_IPC) {
 mca_pml_ob1_cuda_add_ipc_support(btl, flags, errproc, btlinfo);
 return;
 }
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 ompi_rte_abort(-1, NULL);
 }

View file

@@ -53,7 +53,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
 mca_bml_base_btl_t* bml_btl,
 size_t size) {
 int rc;
-#if OMPI_CUDA_SUPPORT_41
+#if OPAL_CUDA_SUPPORT_41
 sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA;
 if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) {
 unsigned char *base;
@@ -87,7 +87,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
 #else
 /* Just do the rendezvous but set initial data to be sent to zero */
 rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
-#endif /* OMPI_CUDA_SUPPORT_41 */
+#endif /* OPAL_CUDA_SUPPORT_41 */
 return rc;
 }

View file

@@ -22,12 +22,12 @@
 #include "pml_ob1.h"
 #include "pml_ob1_sendreq.h"
 #include "ompi/mca/bml/base/base.h"
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 #include "ompi/mca/common/cuda/common_cuda.h"
 #include "pml_ob1_recvreq.h"
 #include "ompi/runtime/params.h"
 static void mca_pml_ob1_process_pending_cuda_async_copies(void);
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 int mca_pml_ob1_progress(void)
 {
@@ -35,9 +35,9 @@ int mca_pml_ob1_progress(void)
 int j, completed_requests = 0;
 bool send_succedded;
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 mca_pml_ob1_process_pending_cuda_async_copies();
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 if( OPAL_LIKELY(0 == queue_length) )
 return 0;
@@ -87,7 +87,7 @@ int mca_pml_ob1_progress(void)
 return completed_requests;
 }
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 static void mca_pml_ob1_process_pending_cuda_async_copies(void)
 {
 mca_btl_base_descriptor_t *frag;
@@ -106,4 +106,4 @@ static void mca_pml_ob1_process_pending_cuda_async_copies(void)
 /* Consider progressing dtoh events here in future */
 }
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */

View file

@@ -44,10 +44,10 @@
 #include "pml_ob1_recvreq.h"
 #include "pml_ob1_sendreq.h"
 #include "pml_ob1_hdr.h"
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 #include "opal/datatype/opal_datatype_cuda.h"
 #include "ompi/mca/common/cuda/common_cuda.h"
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 OBJ_CLASS_INSTANCE( mca_pml_ob1_buffer_t,
 ompi_free_list_item_t,
@@ -334,7 +334,7 @@ void mca_pml_ob1_recv_frag_callback_ack(mca_btl_base_module_t* btl,
 OPAL_THREAD_ADD32(&sendreq->req_state, -1);
 }
-#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
 if ((sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) &&
 (btl->btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND)) {
 /* The user's buffer is GPU and this BTL can support asynchronous copies,
@@ -343,7 +343,7 @@ void mca_pml_ob1_recv_frag_callback_ack(mca_btl_base_module_t* btl,
 void *strm = mca_common_cuda_get_dtoh_stream();
 opal_cuda_set_copy_function_async(&sendreq->req_send.req_base.req_convertor, strm);
 }
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 if(send_request_pml_complete_check(sendreq) == false)
 mca_pml_ob1_send_request_schedule(sendreq);
@@ -364,7 +364,7 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
 }
 ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FRAG);
 recvreq = (mca_pml_ob1_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval;
-#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
 /* If data is destined for GPU buffer and convertor was set up for asynchronous
 * copies, then start the copy and return. The copy completion will trigger
 * the next phase. */
@@ -379,7 +379,7 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
 return;
 }
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 mca_pml_ob1_recv_request_progress_frag(recvreq,btl,segments,des->des_dst_cnt);
 return;

View file

@@ -37,15 +37,15 @@
 #include "ompi/mca/bml/base/base.h"
 #include "opal/util/arch.h"
 #include "ompi/memchecker.h"
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 #include "opal/datatype/opal_datatype_cuda.h"
 #include "ompi/mca/common/cuda/common_cuda.h"
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 int mca_pml_ob1_cuda_need_buffers(mca_pml_ob1_recv_request_t* recvreq,
 mca_btl_base_module_t* btl);
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 void mca_pml_ob1_recv_request_process_pending(void)
 {
@@ -530,7 +530,7 @@ void mca_pml_ob1_recv_request_progress_frag( mca_pml_ob1_recv_request_t* recvreq
 }
 }
-#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
 /**
 * This function is basically the first half of the code in the
 * mca_pml_ob1_recv_request_progress_frag function. This fires off
@@ -607,7 +607,7 @@ void mca_pml_ob1_recv_request_frag_copy_finished( mca_btl_base_module_t* btl,
 mca_pml_ob1_recv_request_schedule(recvreq, NULL);
 }
 }
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 /*
 * Update the recv request status to reflect the number of bytes
@@ -638,9 +638,9 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
 * sender side is already registered. We need to be smarter here, perhaps
 * do couple of RDMA reads */
 if (opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == true) {
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 if (mca_pml_ob1_cuda_need_buffers(recvreq, btl))
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 {
 mca_pml_ob1_recv_request_ack(recvreq, &hdr->hdr_rndv, 0);
 return;
@@ -651,7 +651,7 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
 bml_endpoint = (mca_bml_base_endpoint_t*)recvreq->req_recv.req_base.req_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
 rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl);
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 if (OPAL_UNLIKELY(NULL == rdma_bml)) {
 if (recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA) {
 mca_bml_base_btl_t *bml_btl;
@@ -666,7 +666,7 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
 return;
 }
 }
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 if (OPAL_UNLIKELY(NULL == rdma_bml)) {
 opal_output(0, "[%s:%d] invalid bml for rdma get", __FILE__, __LINE__);
@@ -786,7 +786,7 @@ void mca_pml_ob1_recv_request_progress_rndv( mca_pml_ob1_recv_request_t* recvreq
 mca_pml_ob1_recv_request_schedule(recvreq, NULL);
 }
-#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
 /* If BTL supports it and this is a CUDA buffer being received into,
 * have all subsequent FRAGS copied in asynchronously. */
 if ((recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA) &&

View file

@@ -304,7 +304,7 @@ void mca_pml_ob1_recv_request_progress_frag(
 mca_btl_base_segment_t* segments,
 size_t num_segments);
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 void mca_pml_ob1_recv_request_frag_copy_start(
 mca_pml_ob1_recv_request_t* req,
 struct mca_btl_base_module_t* btl,
@@ -316,7 +316,7 @@ void mca_pml_ob1_recv_request_frag_copy_finished(struct mca_btl_base_module_t* b
 struct mca_btl_base_endpoint_t* ep,
 struct mca_btl_base_descriptor_t* des,
 int status );
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 /**
 *
 */

View file

@@ -331,7 +331,7 @@ mca_pml_ob1_frag_completion( mca_btl_base_module_t* btl,
 MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
 }
-#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
 /**
 * This function is called when the copy of the frag from the GPU buffer
 * to the internal buffer is complete. Used to support asynchronous
@@ -362,7 +362,7 @@ mca_pml_ob1_copy_frag_completion( mca_btl_base_module_t* btl,
 orte_errmgr.abort(-1, NULL);
 }
 }
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 /**
 * Buffer the entire message and mark as complete.
@@ -1062,7 +1062,7 @@ cannot_pack:
 &(sendreq->req_send.req_base), size, PERUSE_SEND);
 #endif /* OMPI_WANT_PERUSE */
-#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
 /* At this point, check to see if the BTL is doing an asynchronous
 * copy. This would have been initiated in the mca_bml_base_prepare_src
 * called above. The flag is checked here as we let the hdr be
@@ -1086,7 +1086,7 @@ cannot_pack:
 }
 continue;
 }
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 /* initiate send - note that this may complete before the call returns */
 rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_FRAG);

View file

@@ -320,12 +320,12 @@ mca_pml_ob1_send_request_schedule(mca_pml_ob1_send_request_t* sendreq)
 mca_pml_ob1_send_request_schedule_exclusive(sendreq);
 }
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 int mca_pml_ob1_send_request_start_cuda(
 mca_pml_ob1_send_request_t* sendreq,
 mca_bml_base_btl_t* bml_btl,
 size_t size);
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 /**
 * Start the specified request
@@ -411,11 +411,11 @@ mca_pml_ob1_send_request_start_btl( mca_pml_ob1_send_request_t* sendreq,
 MCA_PML_OB1_HDR_FLAGS_CONTIG);
 }
 } else {
-#if OMPI_CUDA_SUPPORT
+#if OPAL_CUDA_SUPPORT
 if (sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) {
 return mca_pml_ob1_send_request_start_cuda(sendreq, bml_btl, size);
 }
-#endif /* OMPI_CUDA_SUPPORT */
+#endif /* OPAL_CUDA_SUPPORT */
 rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size, 0);
 }
 }

View file

@@ -63,7 +63,7 @@ int ompi_mpi_leave_pinned = -1;
 bool ompi_mpi_leave_pinned_pipeline = false;
 bool ompi_have_sparse_group_storage = OPAL_INT_TO_BOOL(OMPI_GROUP_SPARSE);
 bool ompi_use_sparse_group_storage = OPAL_INT_TO_BOOL(OMPI_GROUP_SPARSE);
-bool ompi_mpi_built_with_cuda_support = OPAL_INT_TO_BOOL(OMPI_CUDA_SUPPORT);
+bool ompi_mpi_built_with_cuda_support = OPAL_INT_TO_BOOL(OPAL_CUDA_SUPPORT);
 bool ompi_mpi_cuda_support;
 uint32_t ompi_hostname_cutoff = UINT32_MAX;