Update smcuda to match recent changes in sm BTL.
This commit was SVN r27803.
This commit is contained in:
parent
34d1f0a585
commit
a07a4bb3f7
@@ -34,14 +34,22 @@
#include <sys/mman.h>
#endif /* HAVE_SYS_MMAN_H */

#ifdef OMPI_BTL_SM_CMA_NEED_SYSCALL_DEFS
#include "opal/sys/cma.h"
#endif /* OMPI_BTL_SM_CMA_NEED_SYSCALL_DEFS */

#include "opal/sys/atomic.h"
#include "opal/class/opal_bitmap.h"
#include "opal/util/output.h"
#include "opal/util/printf.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/mca/shmem/base/base.h"
#include "opal/mca/shmem/shmem.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "opal/datatype/opal_convertor.h"
#include "ompi/class/ompi_free_list.h"
#include "ompi/runtime/ompi_module_exchange.h"
#include "ompi/mca/btl/btl.h"
#if OMPI_CUDA_SUPPORT
#include "ompi/mca/common/cuda/common_cuda.h"
@@ -83,7 +91,7 @@ mca_btl_smcuda_t mca_btl_smcuda = {
        mca_btl_smcuda_alloc,
        mca_btl_smcuda_free,
        mca_btl_smcuda_prepare_src,
#if OMPI_CUDA_SUPPORT
#if OMPI_CUDA_SUPPORT || OMPI_BTL_SM_HAVE_KNEM || OMPI_BTL_SM_HAVE_CMA
        mca_btl_smcuda_prepare_dst,
#else
        NULL,
@@ -92,7 +100,7 @@ mca_btl_smcuda_t mca_btl_smcuda = {
        mca_btl_smcuda_sendi,
        NULL, /* put */
        NULL, /* get -- optionally filled during initialization */
        mca_btl_base_dump,
        mca_btl_smcuda_dump,
        NULL, /* mpool */
        mca_btl_smcuda_register_error_cb, /* register error */
        mca_btl_smcuda_ft_event
@@ -110,7 +118,6 @@ mca_btl_smcuda_t mca_btl_smcuda = {
 */
#define OFFSET2ADDR(OFFSET, BASE) ((ptrdiff_t)(OFFSET) + (char*)(BASE))


static void *mpool_calloc(size_t nmemb, size_t size)
{
    void *buf;
@@ -126,16 +133,104 @@ static void *mpool_calloc(size_t nmemb, size_t size)
    return buf;
}


static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, int n)
static int
setup_mpool_base_resources(mca_btl_smcuda_component_t *comp_ptr,
                           mca_mpool_base_resources_t *out_res)
{
    size_t size, length, length_payload;
    char *sm_ctl_file;
    int rc = OMPI_SUCCESS;
    int fd = -1;
    ssize_t bread = 0;

    if (-1 == (fd = open(comp_ptr->sm_mpool_rndv_file_name, O_RDONLY))) {
        int err = errno;
        orte_show_help("help-mpi-btl-smcuda.txt", "sys call fail", true,
                       "open(2)", strerror(err), err);
        rc = OMPI_ERR_IN_ERRNO;
        goto out;
    }
    if ((ssize_t)sizeof(opal_shmem_ds_t) != (bread =
        read(fd, &out_res->bs_meta_buf, sizeof(opal_shmem_ds_t)))) {
        opal_output(0, "setup_mpool_base_resources: "
                    "Read inconsistency -- read: %lu, but expected: %lu!\n",
                    (unsigned long)bread,
                    (unsigned long)sizeof(opal_shmem_ds_t));
        rc = OMPI_ERROR;
        goto out;
    }
    if ((ssize_t)sizeof(out_res->size) != (bread =
        read(fd, &out_res->size, sizeof(size_t)))) {
        opal_output(0, "setup_mpool_base_resources: "
                    "Read inconsistency -- read: %lu, but expected: %lu!\n",
                    (unsigned long)bread,
                    (unsigned long)sizeof(opal_shmem_ds_t));
        rc = OMPI_ERROR;
        goto out;
    }

out:
    if (-1 != fd) {
        (void)close(fd);
    }
    return rc;
}

static int
sm_segment_attach(mca_btl_smcuda_component_t *comp_ptr)
{
    int rc = OMPI_SUCCESS;
    int fd = -1;
    ssize_t bread = 0;
    opal_shmem_ds_t *tmp_shmem_ds = calloc(1, sizeof(*tmp_shmem_ds));

    if (NULL == tmp_shmem_ds) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    if (-1 == (fd = open(comp_ptr->sm_rndv_file_name, O_RDONLY))) {
        int err = errno;
        orte_show_help("help-mpi-btl-sm.txt", "sys call fail", true,
                       "open(2)", strerror(err), err);
        rc = OMPI_ERR_IN_ERRNO;
        goto out;
    }
    if ((ssize_t)sizeof(opal_shmem_ds_t) != (bread =
        read(fd, tmp_shmem_ds, sizeof(opal_shmem_ds_t)))) {
        opal_output(0, "sm_segment_attach: "
                    "Read inconsistency -- read: %lu, but expected: %lu!\n",
                    (unsigned long)bread,
                    (unsigned long)sizeof(opal_shmem_ds_t));
        rc = OMPI_ERROR;
        goto out;
    }
    if (NULL == (comp_ptr->sm_seg =
                 mca_common_sm_module_attach(tmp_shmem_ds,
                                             sizeof(mca_common_sm_seg_header_t),
                                             opal_cache_line_size))) {
        /* don't have to detach here, because module_attach cleans up after
         * itself on failure. */
        opal_output(0, "sm_segment_attach: "
                    "mca_common_sm_module_attach failure!\n");
        return OMPI_ERROR;
    }

out:
    if (-1 != fd) {
        (void)close(fd);
    }
    if (tmp_shmem_ds) {
        free(tmp_shmem_ds);
    }
    return rc;
}

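For reference, a minimal sketch of the rendezvous-file layout that the two readers above assume; the diagram is editorial illustration, not part of the commit:

/* Rendezvous-file layout consumed by the readers above (illustrative):
 *
 *   sm_mpool_rndv_file_name           sm_rndv_file_name
 *   +---------------------------+     +---------------------------+
 *   | opal_shmem_ds_t           |     | opal_shmem_ds_t           |
 *   +---------------------------+     +---------------------------+
 *   | size_t (mpool size)       |
 *   +---------------------------+
 *
 * create_rndv_file() in btl_smcuda_component.c writes exactly these bytes,
 * so its write order and the read order here must stay in sync. */
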
static int
smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl,
                           int32_t my_smp_rank,
                           int n)
{
    size_t length, length_payload;
    sm_fifo_t *my_fifos;
    int my_mem_node, num_mem_nodes, i;
    ompi_proc_t **procs;
    size_t num_procs;
    mca_mpool_base_resources_t res;
    int my_mem_node, num_mem_nodes, i, rc;
    mca_mpool_base_resources_t *res = NULL;
    mca_btl_smcuda_component_t* m = &mca_btl_smcuda_component;

    /* Assume we don't have hwloc support and fill in dummy info */
@@ -189,67 +284,34 @@ static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, int n)
    }
#endif

    /* lookup shared memory pool */
    mca_btl_smcuda_component.sm_mpools = (mca_mpool_base_module_t **) calloc(num_mem_nodes,
                                                                             sizeof(mca_mpool_base_module_t*));

    /* Create one mpool.  Per discussion with George and a UTK Euro
       MPI 2010 paper, it may be beneficial to create multiple mpools.
       Leaving that for a future optimization, however. */
    /* Disable memory binding, because each MPI process will claim
       pages in the mpool for their local NUMA node */
    res.mem_node = -1;

    /* determine how much memory to create */
    /*
     * This heuristic formula mostly says that we request memory for:
     * - nfifos FIFOs, each comprising:
     *   . a sm_fifo_t structure
     *   . many pointers (fifo_size of them per FIFO)
     * - eager fragments (2*n of them, allocated in sm_free_list_inc chunks)
     * - max fragments (sm_free_list_num of them)
     *
     * On top of all that, we sprinkle in some number of
     * "opal_cache_line_size" additions to account for some
     * padding and edge effects that may lie in the allocator.
     */
    res.size =
        FIFO_MAP_NUM(n) * ( sizeof(sm_fifo_t) + sizeof(void *) * m->fifo_size + 4 * opal_cache_line_size )
        + ( 2 * n + m->sm_free_list_inc ) * ( m->eager_limit + 2 * opal_cache_line_size )
        + m->sm_free_list_num * ( m->max_frag_size + 2 * opal_cache_line_size );

    /* before we multiply by n, make sure the result won't overflow */
    /* Stick that little pad in, particularly since we'll eventually
     * need a little extra space.  E.g., in mca_mpool_sm_init() in
     * mpool_sm_component.c when sizeof(mca_common_sm_module_t) is
     * added.
     */
    if ( ((double) res.size) * n > LONG_MAX - 4096 ) {
    if (NULL == (res = calloc(1, sizeof(*res)))) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    res.size *= n;

    /* now, create it */

    /* lookup shared memory pool */
    mca_btl_smcuda_component.sm_mpools =
        (mca_mpool_base_module_t **)calloc(num_mem_nodes,
                                           sizeof(mca_mpool_base_module_t *));

    /* Disable memory binding, because each MPI process will claim pages in the
     * mpool for their local NUMA node */
    res->mem_node = -1;

    if (OMPI_SUCCESS != (rc = setup_mpool_base_resources(m, res))) {
        free(res);
        return rc;
    }
    /* now that res is fully populated, create the thing */
    mca_btl_smcuda_component.sm_mpools[0] =
        mca_mpool_base_module_create(mca_btl_smcuda_component.sm_mpool_name,
                                     smcuda_btl, &res);
                                     smcuda_btl, res);
    /* Sanity check to ensure that we found it */
    if (NULL == mca_btl_smcuda_component.sm_mpools[0]) {
        return OMPI_ERR_OUT_OF_RESOURCE;
        free(res);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    mca_btl_smcuda_component.sm_mpool = mca_btl_smcuda_component.sm_mpools[0];
#if OMPI_CUDA_SUPPORT
    /* Create a local memory pool that sends handles to the remote
     * side.  Note that the res argument is not really used, but
     * needed to satisfy function signature. */
    smcuda_btl->super.btl_mpool = mca_mpool_base_module_create("gpusm",
                                                               smcuda_btl,
                                                               &res);
    if (NULL == smcuda_btl->super.btl_mpool) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
#endif /* OMPI_CUDA_SUPPORT */

    mca_btl_smcuda_component.sm_mpool_base =
        mca_btl_smcuda_component.sm_mpools[0]->mpool_base(mca_btl_smcuda_component.sm_mpools[0]);
@@ -258,37 +320,30 @@ static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, int n)
    mca_btl_smcuda_component.sm_peers = (struct mca_btl_base_endpoint_t**)
        calloc(n, sizeof(struct mca_btl_base_endpoint_t*));
    if (NULL == mca_btl_smcuda_component.sm_peers) {
        free(res);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* Allocate Shared Memory BTL process coordination
     * data structure.  This will reside in shared memory */

    /* set file name */
    if (asprintf(&sm_ctl_file, "%s"OPAL_PATH_SEP"shared_mem_btl_module.%s",
                 orte_process_info.job_session_dir,
                 orte_process_info.nodename) < 0) {
    /* remember that node rank zero is already attached */
    if (0 != my_smp_rank) {
        if (OMPI_SUCCESS != (rc = sm_segment_attach(m))) {
            free(res);
            return rc;
        }
    }
#if OMPI_CUDA_SUPPORT
    /* Create a local memory pool that sends handles to the remote
     * side.  Note that the res argument is not really used, but
     * needed to satisfy function signature. */
    smcuda_btl->super.btl_mpool = mca_mpool_base_module_create("gpusm",
                                                               smcuda_btl,
                                                               res);
    if (NULL == smcuda_btl->super.btl_mpool) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
#endif /* OMPI_CUDA_SUPPORT */

    /* Pass in a data segment alignment of 0 to get no data
       segment (only the shared control structure) */
    size = sizeof(mca_common_sm_seg_header_t) +
        n * (sizeof(sm_fifo_t*) + sizeof(char *) + sizeof(uint16_t)) + opal_cache_line_size;
    procs = ompi_proc_world(&num_procs);
    if (!(mca_btl_smcuda_component.sm_seg =
          mca_common_sm_init(procs, num_procs, size, sm_ctl_file,
                             sizeof(mca_common_sm_seg_header_t),
                             opal_cache_line_size))) {
        opal_output(0, "mca_btl_smcuda_add_procs: unable to create shared memory "
"BTL coordinating strucure :: size %lu \n",
|
||||
                    (unsigned long)size);
        free(procs);
        free(sm_ctl_file);
        return OMPI_ERROR;
    }
    free(procs);
    free(sm_ctl_file);
    /* it is now safe to free the mpool resources */
    free(res);

    /* check to make sure number of local procs is within the
     * specified limits */
@@ -387,6 +442,7 @@ static struct mca_btl_base_endpoint_t *
create_sm_endpoint(int local_proc, struct ompi_proc_t *proc)
{
    struct mca_btl_base_endpoint_t *ep;

#if OMPI_ENABLE_PROGRESS_THREADS == 1
    char path[PATH_MAX];
#endif
@@ -426,22 +482,6 @@ create_sm_endpoint(int local_proc, struct ompi_proc_t *proc)
    return ep;
}

static void calc_sm_max_procs(int n)
{
    /* see if need to allocate space for extra procs */
    if(0 > mca_btl_smcuda_component.sm_max_procs) {
        /* no limit */
        if(0 <= mca_btl_smcuda_component.sm_extra_procs) {
            /* limit */
            mca_btl_smcuda_component.sm_max_procs =
                n + mca_btl_smcuda_component.sm_extra_procs;
        } else {
            /* no limit */
            mca_btl_smcuda_component.sm_max_procs = 2 * n;
        }
    }
}

int mca_btl_smcuda_add_procs(
    struct mca_btl_base_module_t* btl,
    size_t nprocs,
@@ -455,6 +495,9 @@ int mca_btl_smcuda_add_procs(
    mca_btl_smcuda_t *smcuda_btl;
    bool have_connected_peer = false;
    char **bases;
    /* for easy access to the mpool_sm_module */
    mca_mpool_sm_module_t *sm_mpool_modp = NULL;

    /* initialization */

    smcuda_btl = (mca_btl_smcuda_t *)btl;
@@ -467,7 +510,7 @@ int mca_btl_smcuda_add_procs(
     * and identify procs that are on this host.  Add procs on this
     * host to shared memory reachability list.  Also, get number
     * of local procs in the procs list. */
    for(proc = 0; proc < (int32_t)nprocs; proc++) {
    for (proc = 0; proc < (int32_t)nprocs; proc++) {
        /* check to see if this proc can be reached via shmem (i.e.,
           if they're on my local host and in my job) */
        if (procs[proc]->proc_name.jobid != my_proc->proc_name.jobid ||
@@ -502,18 +545,18 @@ int mca_btl_smcuda_add_procs(
        goto CLEANUP;

    /* make sure that my_smp_rank has been defined */
    if(-1 == my_smp_rank) {
    if (-1 == my_smp_rank) {
        return_code = OMPI_ERROR;
        goto CLEANUP;
    }

    calc_sm_max_procs(n_local_procs);

    if (!smcuda_btl->btl_inited) {
        return_code =
            smcuda_btl_first_time_init(smcuda_btl, mca_btl_smcuda_component.sm_max_procs);
        if(return_code != OMPI_SUCCESS)
            smcuda_btl_first_time_init(smcuda_btl, my_smp_rank,
                                       mca_btl_smcuda_component.sm_max_procs);
        if (return_code != OMPI_SUCCESS) {
            goto CLEANUP;
        }
    }

    /* set local proc's smp rank in the peers structure for
@@ -526,6 +569,7 @@ int mca_btl_smcuda_add_procs(
    }

    bases = mca_btl_smcuda_component.shm_bases;
    sm_mpool_modp = (mca_mpool_sm_module_t *)mca_btl_smcuda_component.sm_mpool;

    /* initialize own FIFOs */
    /*
@@ -549,13 +593,48 @@ int mca_btl_smcuda_add_procs(
    /* Sync with other local procs. Force the FIFO initialization to always
     * happen before the readers access it.
     */
    opal_atomic_add_32( &mca_btl_smcuda_component.sm_seg->module_seg->seg_inited, 1);
    opal_atomic_add_32(&mca_btl_smcuda_component.sm_seg->module_seg->seg_inited, 1);
    while( n_local_procs >
           mca_btl_smcuda_component.sm_seg->module_seg->seg_inited) {
        opal_progress();
        opal_atomic_rmb();
    }

    /* it is now safe to unlink the shared memory segment. only one process
     * needs to do this, so just let smp rank zero take care of it. */
    if (0 == my_smp_rank) {
        if (OMPI_SUCCESS !=
            mca_common_sm_module_unlink(mca_btl_smcuda_component.sm_seg)) {
            /* it is "okay" if this fails at this point. we have gone this far,
             * so just warn about the failure and continue. this is probably
             * only triggered by a programming error. */
            opal_output(0, "WARNING: common_sm_module_unlink failed.\n");
        }
        /* SKG - another abstraction violation here, but I don't want to add
         * extra code in the sm mpool for further synchronization. */

        /* at this point, all processes have attached to the mpool segment. so
         * it is safe to unlink it here. */
        if (OMPI_SUCCESS !=
            mca_common_sm_module_unlink(sm_mpool_modp->sm_common_module)) {
            opal_output(0, "WARNING: common_sm_module_unlink failed.\n");
        }
        if (-1 == unlink(mca_btl_smcuda_component.sm_mpool_rndv_file_name)) {
            opal_output(0, "WARNING: %s unlink failed.\n",
                        mca_btl_smcuda_component.sm_mpool_rndv_file_name);
        }
        if (-1 == unlink(mca_btl_smcuda_component.sm_rndv_file_name)) {
            opal_output(0, "WARNING: %s unlink failed.\n",
                        mca_btl_smcuda_component.sm_rndv_file_name);
        }
    }

    /* free up some space used by the name buffers */
    free(mca_btl_smcuda_component.sm_mpool_ctl_file_name);
    free(mca_btl_smcuda_component.sm_mpool_rndv_file_name);
    free(mca_btl_smcuda_component.sm_ctl_file_name);
    free(mca_btl_smcuda_component.sm_rndv_file_name);

    /* coordinate with other processes */
    for(j = mca_btl_smcuda_component.num_smp_procs;
        j < mca_btl_smcuda_component.num_smp_procs + n_local_procs; j++) {
@@ -699,6 +778,7 @@ struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_src(
    uint32_t iov_count = 1;
    size_t max_data = *size;
    int rc;

#if OMPI_CUDA_SUPPORT
    if (0 != reserve) {
#endif /* OMPI_CUDA_SUPPORT */
@@ -767,7 +847,7 @@ struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_src(
}

#if 0
#define MCA_BTL_SMCUDA_TOUCH_DATA_TILL_CACHELINE_BOUNDARY(sm_frag) \
#define MCA_BTL_SMCUDA_TOUCH_DATA_TILL_CACHELINE_BOUNDARY(sm_frag) \
do { \
    char* _memory = (char*)(sm_frag)->segment.base.seg_addr.pval + \
        (sm_frag)->segment.base.seg_len; \
@@ -1054,6 +1134,32 @@ int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t* btl,
}
#endif /* OMPI_CUDA_SUPPORT */

/**
 *
 */
void mca_btl_smcuda_dump(struct mca_btl_base_module_t* btl,
                         struct mca_btl_base_endpoint_t* endpoint,
                         int verbose)
{
    opal_list_item_t *item;
    mca_btl_smcuda_frag_t* frag;

    mca_btl_base_err("BTL SM %p endpoint %p [smp_rank %d] [peer_rank %d]\n",
                     (void*) btl, (void*) endpoint,
                     endpoint->my_smp_rank, endpoint->peer_smp_rank);
    if( NULL != endpoint ) {
        for(item = opal_list_get_first(&endpoint->pending_sends);
            item != opal_list_get_end(&endpoint->pending_sends);
            item = opal_list_get_next(item)) {
            frag = (mca_btl_smcuda_frag_t*)item;
            mca_btl_base_err(" | frag %p size %lu (hdr frag %p len %lu rank %d tag %d)\n",
                             (void*) frag, frag->size, (void*) frag->hdr->frag,
                             frag->hdr->len, frag->hdr->my_smp_rank,
                             frag->hdr->tag);
        }
    }
}

#if OPAL_ENABLE_FT_CR == 0
int mca_btl_smcuda_ft_event(int state) {
    return OMPI_SUCCESS;
@@ -39,6 +39,7 @@

#include "opal/util/bit_ops.h"
#include "opal/class/opal_free_list.h"

#include "ompi/mca/btl/btl.h"
#include "ompi/mca/common/sm/common_sm.h"

@@ -191,6 +192,16 @@ struct mca_btl_smcuda_component_t {
    /** If we want DMA and DMA is supported, this will be loaded with
        KNEM_FLAG_DMA.  Otherwise, it'll be 0. */
    int knem_dma_flag;

    /** MCA: should we be using CMA or not?
        0 = no, 1 = yes */
    int use_cma;

    /* /// well-known file names for sm and sm mpool init /// */
    char *sm_mpool_ctl_file_name;
    char *sm_mpool_rndv_file_name;
    char *sm_ctl_file_name;
    char *sm_rndv_file_name;
};
typedef struct mca_btl_smcuda_component_t mca_btl_smcuda_component_t;
OMPI_MODULE_DECLSPEC extern mca_btl_smcuda_component_t mca_btl_smcuda_component;
@@ -207,10 +218,6 @@ struct mca_btl_smcuda_t {
typedef struct mca_btl_smcuda_t mca_btl_smcuda_t;
OMPI_MODULE_DECLSPEC extern mca_btl_smcuda_t mca_btl_smcuda;





struct btl_smcuda_pending_send_item_t
{
    opal_free_list_item_t super;
@@ -484,6 +491,11 @@ extern struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_dst(
    uint32_t flags);
#endif /* OMPI_CUDA_SUPPORT */


extern void mca_btl_smcuda_dump(struct mca_btl_base_module_t* btl,
                                struct mca_btl_base_endpoint_t* endpoint,
                                int verbose);

/**
 * Fault Tolerance Event Notification Function
 * @param state Checkpoint State
@@ -41,22 +41,25 @@
#include <sys/stat.h>  /* for mkfifo */
#endif /* HAVE_SYS_STAT_H */

#include "ompi/constants.h"
#include "opal/mca/event/event.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/shmem/base/base.h"
#include "opal/mca/shmem/shmem.h"
#include "opal/util/bit_ops.h"
#include "opal/util/output.h"
#include "orte/util/proc_info.h"

#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/proc_info.h"

#include "opal/mca/base/mca_base_param.h"
#include "ompi/constants.h"
#include "ompi/runtime/ompi_module_exchange.h"
#include "ompi/mca/mpool/base/base.h"
#include "ompi/mca/common/sm/common_sm.h"
#include "ompi/mca/btl/base/btl_base_error.h"
#if OMPI_CUDA_SUPPORT
#include "ompi/runtime/params.h"
#include "ompi/mca/common/cuda/common_cuda.h"
#endif /* OMPI_CUDA_SUPPORT */
#include "ompi/mca/common/sm/common_sm.h"
#include "ompi/mca/btl/base/btl_base_error.h"

#if OPAL_ENABLE_FT_CR == 1
#include "opal/runtime/opal_cr.h"
@@ -75,6 +78,10 @@ static mca_btl_base_module_t** mca_btl_smcuda_component_init(
    bool enable_mpi_threads
);

typedef enum {
    MCA_BTL_SM_RNDV_MOD_SM = 0,
    MCA_BTL_SM_RNDV_MOD_MPOOL
} mca_btl_sm_rndv_module_type_t;

/*
 * Shared Memory (SM) component instance.
@@ -290,49 +297,417 @@ CLEANUP:
    return return_value;
}

/*
 * Returns the number of processes on the node.
 */
static inline int
get_num_local_procs(void)
{
    /* num_local_peers does not include us in
     * its calculation, so adjust for that */
    return (int)(1 + orte_process_info.num_local_peers);
}

static void
calc_sm_max_procs(int n)
{
    /* see if need to allocate space for extra procs */
    if (0 > mca_btl_smcuda_component.sm_max_procs) {
        /* no limit */
        if (0 <= mca_btl_smcuda_component.sm_extra_procs) {
            /* limit */
            mca_btl_smcuda_component.sm_max_procs =
                n + mca_btl_smcuda_component.sm_extra_procs;
        } else {
            /* no limit */
            mca_btl_smcuda_component.sm_max_procs = 2 * n;
        }
    }
}
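A quick worked example of the sizing policy above (the starting values are assumed for illustration, not taken from the commit):

/* Example: with sm_max_procs < 0 and sm_extra_procs < 0, n = 16 local
 * procs yields sm_max_procs = 2 * 16 = 32; with sm_extra_procs = 4 it
 * would instead yield 16 + 4 = 20. */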

static int
create_and_attach(mca_btl_smcuda_component_t *comp_ptr,
                  size_t size,
                  char *file_name,
                  size_t size_ctl_structure,
                  size_t data_seg_alignment,
                  mca_common_sm_module_t **out_modp)

{
    if (NULL == (*out_modp =
                 mca_common_sm_module_create_and_attach(size, file_name,
                                                        size_ctl_structure,
                                                        data_seg_alignment))) {
        opal_output(0, "create_and_attach: unable to create shared memory "
"BTL coordinating strucure :: size %lu \n",
|
||||
                    (unsigned long)size);
        return OMPI_ERROR;
    }
    return OMPI_SUCCESS;
}

/*
 * SKG - I'm not happy with this, but I can't figure out a better way of
 * finding the sm mpool's minimum size 8-|. The way I see it, this BTL only
 * uses the sm mpool, so maybe this isn't so bad...
 *
 * The problem is that we need to size the mpool resources at sm BTL component
 * init. That means we need to know the mpool's minimum size at create.
 */
static int
get_min_mpool_size(mca_btl_smcuda_component_t *comp_ptr,
                   size_t *out_size)
{
    char *type_name = "mpool";
    char *param_name = "min_size";
    char *min_size = NULL;
    int id = 0;
    size_t default_min = 67108864;
    size_t size = 0;
    long tmp_size = 0;

    if (0 > (id = mca_base_param_find(type_name, comp_ptr->sm_mpool_name,
                                      param_name))) {
        opal_output(0, "mca_base_param_find: failure looking for %s_%s_%s\n",
                    type_name, comp_ptr->sm_mpool_name, param_name);
        return OMPI_ERR_NOT_FOUND;
    }
    if (OPAL_ERROR == mca_base_param_lookup_string(id, &min_size)) {
        opal_output(0, "mca_base_param_lookup_string failure\n");
        return OMPI_ERROR;
    }
    errno = 0;
    tmp_size = strtol(min_size, (char **)NULL, 10);
    if (ERANGE == errno || EINVAL == errno || tmp_size <= 0) {
        opal_output(0, "mca_btl_sm::get_min_mpool_size: "
                    "Unusable %s_%s_min_size provided. "
                    "Continuing with %lu.", type_name,
                    comp_ptr->sm_mpool_name,
                    (unsigned long)default_min);

        size = default_min;
    }
    else {
        size = (size_t)tmp_size;
    }
    free(min_size);
    *out_size = size;
    return OMPI_SUCCESS;
}
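For instance, if comp_ptr->sm_mpool_name is "sm" (a presumed default; the pool name is configurable), the lookup above resolves the MCA parameter mpool_sm_min_size:

/* Example (assuming sm_mpool_name == "sm"):
 *
 *   mpirun --mca mpool_sm_min_size 134217728 ...
 *
 * would raise the minimum to 128 MB; an unusable value falls back to
 * default_min (64 MB). */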

static int
get_mpool_res_size(int32_t max_procs,
                   size_t *out_res_size)
{
    size_t size = 0;

    *out_res_size = 0;
    /* determine how much memory to create */
    /*
     * This heuristic formula mostly says that we request memory for:
     * - nfifos FIFOs, each comprising:
     *   . a sm_fifo_t structure
     *   . many pointers (fifo_size of them per FIFO)
     * - eager fragments (2*n of them, allocated in sm_free_list_inc chunks)
     * - max fragments (sm_free_list_num of them)
     *
     * On top of all that, we sprinkle in some number of
     * "opal_cache_line_size" additions to account for some
     * padding and edge effects that may lie in the allocator.
     */
    size = FIFO_MAP_NUM(max_procs) *
           (sizeof(sm_fifo_t) + sizeof(void *) *
            mca_btl_smcuda_component.fifo_size + 4 * opal_cache_line_size) +
           (2 * max_procs + mca_btl_smcuda_component.sm_free_list_inc) *
           (mca_btl_smcuda_component.eager_limit + 2 * opal_cache_line_size) +
           mca_btl_smcuda_component.sm_free_list_num *
           (mca_btl_smcuda_component.max_frag_size + 2 * opal_cache_line_size);

    /* add something for the control structure */
    size += sizeof(mca_common_sm_module_t);

    /* before we multiply by max_procs, make sure the result won't overflow */
    /* Stick that little pad in, particularly since we'll eventually
     * need a little extra space.  E.g., in mca_mpool_sm_init() in
     * mpool_sm_component.c when sizeof(mca_common_sm_module_t) is
     * added.
     */
    if (((double)size) * max_procs > LONG_MAX - 4096) {
        return OMPI_ERR_VALUE_OUT_OF_BOUNDS;
    }
    size *= (size_t)max_procs;
    *out_res_size = size;
    return OMPI_SUCCESS;
}
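As a rough worked example of the heuristic above, with hypothetical parameter values (none of these numbers come from the commit itself):

/* Hypothetical sizing: max_procs = 4, FIFO_MAP_NUM(4) = 4, fifo_size = 4096,
 * sm_free_list_inc = 64, sm_free_list_num = 8, eager_limit = 4096,
 * max_frag_size = 32768, opal_cache_line_size = 128, sizeof(sm_fifo_t) = 64:
 *
 *   FIFOs : 4 * (64 + 8*4096 + 4*128)   = 133376 bytes
 *   eager : (2*4 + 64) * (4096 + 2*128) = 313344 bytes
 *   max   : 8 * (32768 + 2*128)         = 264192 bytes
 *
 * i.e. ~711 KB before the control structure is added and the total is
 * multiplied by max_procs -- and the result may still be raised to the
 * mpool minimum (default_min above is 64 MB). */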


/* Generates all the unique paths for the shared-memory segments that this BTL
 * needs along with other file paths used to share "connection information". */
static int
set_uniq_paths_for_init_rndv(mca_btl_smcuda_component_t *comp_ptr)
{
    int rc = OMPI_ERR_OUT_OF_RESOURCE;

    /* NOTE: don't forget to free these after init */
    comp_ptr->sm_mpool_ctl_file_name = NULL;
    comp_ptr->sm_mpool_rndv_file_name = NULL;
    comp_ptr->sm_ctl_file_name = NULL;
    comp_ptr->sm_rndv_file_name = NULL;

    if (asprintf(&comp_ptr->sm_mpool_ctl_file_name,
                 "%s"OPAL_PATH_SEP"shared_mem_cuda_pool.%s",
                 orte_process_info.job_session_dir,
                 orte_process_info.nodename) < 0) {
        /* rc set */
        goto out;
    }
    if (asprintf(&comp_ptr->sm_mpool_rndv_file_name,
                 "%s"OPAL_PATH_SEP"shared_mem_cuda_pool_rndv.%s",
                 orte_process_info.job_session_dir,
                 orte_process_info.nodename) < 0) {
        /* rc set */
        goto out;
    }
    if (asprintf(&comp_ptr->sm_ctl_file_name,
                 "%s"OPAL_PATH_SEP"shared_mem_cuda_btl_module.%s",
                 orte_process_info.job_session_dir,
                 orte_process_info.nodename) < 0) {
        /* rc set */
        goto out;
    }
    if (asprintf(&comp_ptr->sm_rndv_file_name,
                 "%s"OPAL_PATH_SEP"shared_mem_cuda_btl_rndv.%s",
                 orte_process_info.job_session_dir,
                 orte_process_info.nodename) < 0) {
        /* rc set */
        goto out;
    }
    /* all is well */
    rc = OMPI_SUCCESS;

out:
    if (OMPI_SUCCESS != rc) {
        if (comp_ptr->sm_mpool_ctl_file_name) {
            free(comp_ptr->sm_mpool_ctl_file_name);
        }
        if (comp_ptr->sm_mpool_rndv_file_name) {
            free(comp_ptr->sm_mpool_rndv_file_name);
        }
        if (comp_ptr->sm_ctl_file_name) {
            free(comp_ptr->sm_ctl_file_name);
        }
        if (comp_ptr->sm_rndv_file_name) {
            free(comp_ptr->sm_rndv_file_name);
        }
    }
    return rc;
}

static int
create_rndv_file(mca_btl_smcuda_component_t *comp_ptr,
                 mca_btl_sm_rndv_module_type_t type)
{
    size_t size = 0;
    int rc = OMPI_SUCCESS;
    int fd = -1;
    char *fname = NULL;
    /* used as a temporary store so we can extract shmem_ds info */
    mca_common_sm_module_t *tmp_modp = NULL;

    if (MCA_BTL_SM_RNDV_MOD_MPOOL == type) {
        size_t min_size = 0;
        /* get the segment size for the sm mpool. */
        if (OMPI_SUCCESS != (rc = get_mpool_res_size(comp_ptr->sm_max_procs,
                                                     &size))) {
            /* rc is already set */
            goto out;
        }
        /* do we need to update the size based on the sm mpool's min size? */
        if (OMPI_SUCCESS != (rc = get_min_mpool_size(comp_ptr, &min_size))) {
            goto out;
        }
        /* update size if less than required minimum */
        if (size < min_size) {
            size = min_size;
        }
        /* we only need the shmem_ds info at this point. initialization will be
         * completed in the mpool module code. the idea is that we just need this
         * info so we can populate the rndv file (or modex when we have it). */
        if (OMPI_SUCCESS != (rc =
            create_and_attach(comp_ptr, size, comp_ptr->sm_mpool_ctl_file_name,
                              sizeof(mca_common_sm_module_t), 8, &tmp_modp))) {
            /* rc is set */
            goto out;
        }
        fname = comp_ptr->sm_mpool_rndv_file_name;
    }
    else if (MCA_BTL_SM_RNDV_MOD_SM == type) {
        /* calculate the segment size. */
        size = sizeof(mca_common_sm_seg_header_t) +
               comp_ptr->sm_max_procs *
               (sizeof(sm_fifo_t *) +
                sizeof(char *) + sizeof(uint16_t)) +
               opal_cache_line_size;

        if (OMPI_SUCCESS != (rc =
            create_and_attach(comp_ptr, size, comp_ptr->sm_ctl_file_name,
                              sizeof(mca_common_sm_seg_header_t),
                              opal_cache_line_size, &comp_ptr->sm_seg))) {
            /* rc is set */
            goto out;
        }
        fname = comp_ptr->sm_rndv_file_name;
        tmp_modp = comp_ptr->sm_seg;
    }
    else {
        return OMPI_ERR_BAD_PARAM;
    }

    /* at this point, we have all the info we need to populate the rendezvous
     * file containing all the meta info required for attach. */

    /* now just write the contents of tmp_modp->shmem_ds to the full
     * sizeof(opal_shmem_ds_t), so we know where the mpool_res_size starts. */
    if (-1 == (fd = open(fname, O_CREAT | O_RDWR, 0600))) {
        int err = errno;
        orte_show_help("help-mpi-btl-sm.txt", "sys call fail", true,
                       "open(2)", strerror(err), err);
        rc = OMPI_ERR_IN_ERRNO;
        goto out;
    }
    if ((ssize_t)sizeof(opal_shmem_ds_t) != write(fd, &(tmp_modp->shmem_ds),
                                                  sizeof(opal_shmem_ds_t))) {
        int err = errno;
        orte_show_help("help-mpi-btl-sm.txt", "sys call fail", true,
                       "write(2)", strerror(err), err);
        rc = OMPI_ERR_IN_ERRNO;
        goto out;
    }
    if (MCA_BTL_SM_RNDV_MOD_MPOOL == type) {
        if ((ssize_t)sizeof(size) != write(fd, &size, sizeof(size))) {
            int err = errno;
            orte_show_help("help-mpi-btl-sm.txt", "sys call fail", true,
                           "write(2)", strerror(err), err);
            rc = OMPI_ERR_IN_ERRNO;
            goto out;
        }
        /* only do this for the mpool case */
        OBJ_RELEASE(tmp_modp);
    }

out:
    if (-1 != fd) {
        (void)close(fd);
    }
    return rc;
}

/*
 * Creates information required for the sm modex and modex sends it.
 */
static int
backing_store_init(mca_btl_smcuda_component_t *comp_ptr,
                   orte_node_rank_t node_rank)
{
    int rc = OMPI_SUCCESS;

    if (OMPI_SUCCESS != (rc = set_uniq_paths_for_init_rndv(comp_ptr))) {
        goto out;
    }
    if (0 == node_rank) {
        /* === sm mpool === */
        if (OMPI_SUCCESS != (rc =
            create_rndv_file(comp_ptr, MCA_BTL_SM_RNDV_MOD_MPOOL))) {
            goto out;
        }
        /* === sm === */
        if (OMPI_SUCCESS != (rc =
            create_rndv_file(comp_ptr, MCA_BTL_SM_RNDV_MOD_SM))) {
            goto out;
        }
    }

out:
    return rc;
}
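Putting the pieces together, a sketch of the init-time rendezvous protocol implemented above and consumed in btl_smcuda.c (an editorial summary of the flow, not new code):

/* Init-time rendezvous flow (sketch):
 *
 *   mca_btl_smcuda_component_init(), all local procs:
 *     set_uniq_paths_for_init_rndv()  -- everyone stashes the file paths
 *     node rank 0 only, via backing_store_init():
 *       create_rndv_file(MPOOL)       -- writes shmem_ds + mpool size
 *       create_rndv_file(SM)          -- writes shmem_ds
 *
 *   add_procs() -> smcuda_btl_first_time_init(), all local procs:
 *     setup_mpool_base_resources()    -- read the mpool rndv file
 *     sm_segment_attach()             -- non-zero smp ranks attach
 *     smp rank 0 unlinks both rndv files once seg_inited shows that
 *     every local proc has finished attaching. */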

/*
 * SM component initialization
 */
static mca_btl_base_module_t** mca_btl_smcuda_component_init(
    int *num_btls,
    bool enable_progress_threads,
    bool enable_mpi_threads)
static mca_btl_base_module_t **
mca_btl_smcuda_component_init(int *num_btls,
                              bool enable_progress_threads,
                              bool enable_mpi_threads)
{
    int num_local_procs = 0;
    mca_btl_base_module_t **btls = NULL;
    orte_node_rank_t my_node_rank = ORTE_NODE_RANK_INVALID;

    *num_btls = 0;

    /* if no session directory was created, then we cannot be used */
    if (!orte_create_session_dirs) {
        return NULL;
    }

    /* lookup/create shared memory pool only when used */
    mca_btl_smcuda_component.sm_mpool = NULL;
    mca_btl_smcuda_component.sm_mpool_base = NULL;

#if OMPI_ENABLE_PROGRESS_THREADS == 1
    /* create a named pipe to receive events */
    sprintf( mca_btl_smcuda_component.sm_fifo_path,
             "%s"OPAL_PATH_SEP"sm_fifo.%lu", orte_process_info.job_session_dir,
             (unsigned long)ORTE_PROC_MY_NAME->vpid );
    if(mkfifo(mca_btl_smcuda_component.sm_fifo_path, 0660) < 0) {
        opal_output(0, "mca_btl_smcuda_component_init: mkfifo failed with errno=%d\n",errno);
    /* if no session directory was created, then we cannot be used */
    /* SKG - this isn't true anymore. Some backing facilities don't require a
     * file-backed store. Extend shmem to provide this info one day. Especially
     * when we use a proper modex for init. */
    if (!orte_create_session_dirs) {
        return NULL;
    }
    mca_btl_smcuda_component.sm_fifo_fd = open(mca_btl_smcuda_component.sm_fifo_path, O_RDWR);
    /* if we don't have locality information, then we cannot be used because we
     * need to know the respective node ranks for initialization. */
    if (ORTE_NODE_RANK_INVALID ==
        (my_node_rank = orte_process_info.my_node_rank)) {
        orte_show_help("help-mpi-btl-sm.txt", "no locality", true);
        return NULL;
    }
    /* no use trying to use sm with less than two procs, so just bail. */
    if ((num_local_procs = get_num_local_procs()) < 2) {
        return NULL;
    }
    /* calculate max procs so we can figure out how large to make the
     * shared-memory segment. this routine sets component sm_max_procs. */
    calc_sm_max_procs(num_local_procs);

    /* This is where the modex will live some day. For now, just have local rank
     * 0 create a rendezvous file containing the backing store info, so the
     * other local procs can read from it during add_procs. The rest will just
     * stash the known paths for use later in init. */
    if (OMPI_SUCCESS != backing_store_init(&mca_btl_smcuda_component,
                                           my_node_rank)) {
        return NULL;
    }

#if OMPI_ENABLE_PROGRESS_THREADS == 1
    /* create a named pipe to receive events */
    sprintf(mca_btl_smcuda_component.sm_fifo_path,
            "%s"OPAL_PATH_SEP"sm_fifo.%lu",
            orte_process_info.job_session_dir,
            (unsigned long)ORTE_PROC_MY_NAME->vpid);
    if (mkfifo(mca_btl_smcuda_component.sm_fifo_path, 0660) < 0) {
        opal_output(0, "mca_btl_smcuda_component_init: "
                    "mkfifo failed with errno=%d\n",errno);
        return NULL;
    }
    mca_btl_smcuda_component.sm_fifo_fd = open(mca_btl_smcuda_component.sm_fifo_path,
                                               O_RDWR);
    if(mca_btl_smcuda_component.sm_fifo_fd < 0) {
        opal_output(0, "mca_btl_smcuda_component_init: open(%s) failed with errno=%d\n",
        opal_output(0, "mca_btl_smcuda_component_init: "
                    "open(%s) failed with errno=%d\n",
                    mca_btl_smcuda_component.sm_fifo_path, errno);
        return NULL;
    }

    OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_fifo_thread, opal_thread_t);
    mca_btl_smcuda_component.sm_fifo_thread.t_run = (opal_thread_fn_t) mca_btl_smcuda_component_event_thread;
    mca_btl_smcuda_component.sm_fifo_thread.t_run =
        (opal_thread_fn_t)mca_btl_smcuda_component_event_thread;
    opal_thread_start(&mca_btl_smcuda_component.sm_fifo_thread);
#endif

    mca_btl_smcuda_component.sm_btls = (mca_btl_smcuda_t **) malloc( mca_btl_smcuda_component.sm_max_btls * sizeof (mca_btl_smcuda_t *));
    mca_btl_smcuda_component.sm_btls =
        (mca_btl_smcuda_t **)malloc(mca_btl_smcuda_component.sm_max_btls *
                                    sizeof(mca_btl_smcuda_t *));
    if (NULL == mca_btl_smcuda_component.sm_btls) {
        return NULL;
    }
@@ -361,6 +736,7 @@ static mca_btl_base_module_t** mca_btl_smcuda_component_init(
    mca_btl_smcuda.super.btl_get = mca_btl_smcuda_get_cuda;
#endif /* OMPI_CUDA_SUPPORT */


    return btls;

}
@@ -482,8 +858,8 @@ int mca_btl_smcuda_component_progress(void)
#endif
        /* recv upcall */
        reg = mca_btl_base_active_message_trigger + hdr->tag;
        seg.seg_addr.pval = ((char*)hdr) + sizeof(mca_btl_smcuda_hdr_t);
        seg.seg_len = hdr->len;
        seg.seg_addr.pval = ((char *)hdr) + sizeof(mca_btl_smcuda_hdr_t);
        seg.seg_len = hdr->len;
        Frag.base.des_dst_cnt = 1;
        Frag.base.des_dst = &seg;
        reg->cbfunc(&mca_btl_smcuda.super, hdr->tag, &(Frag.base),
@@ -43,7 +43,7 @@ struct mca_btl_base_endpoint_t {
    opal_list_t pending_sends;  /**< pending data to send */

    /** lock for concurrent access to endpoint state */
    opal_mutex_t endpoint_lock;
    opal_mutex_t endpoint_lock;

};

@@ -54,7 +54,7 @@ struct mca_btl_smcuda_segment_t {
    ompi_ptr_t memh_seg_addr;
    /** Length in bytes of entire memory handle */
    uint32_t memh_seg_len;
#endif
#endif /* OMPI_CUDA_SUPPORT */
};
typedef struct mca_btl_smcuda_segment_t mca_btl_smcuda_segment_t;