diff --git a/ompi/mca/btl/sm/btl_sm.c b/ompi/mca/btl/sm/btl_sm.c index f07310dd57..21c55e1e82 100644 --- a/ompi/mca/btl/sm/btl_sm.c +++ b/ompi/mca/btl/sm/btl_sm.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved. * Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2013 Los Alamos National Security, LLC. + * Copyright (c) 2010-2012 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2010-2012 IBM Corporation. All rights reserved. * Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved. @@ -43,12 +43,9 @@ #include "opal/util/output.h" #include "opal/util/printf.h" #include "opal/mca/hwloc/base/base.h" -#include "opal/mca/shmem/base/base.h" -#include "opal/mca/shmem/shmem.h" #include "orte/util/proc_info.h" #include "opal/datatype/opal_convertor.h" #include "ompi/class/ompi_free_list.h" -#include "ompi/runtime/ompi_module_exchange.h" #include "ompi/mca/btl/btl.h" #include "ompi/mca/mpool/base/base.h" #include "ompi/mca/mpool/sm/mpool_sm.h" @@ -114,6 +111,7 @@ mca_btl_sm_t mca_btl_sm = { */ #define OFFSET2ADDR(OFFSET, BASE) ((ptrdiff_t)(OFFSET) + (char*)(BASE)) + static void *mpool_calloc(size_t nmemb, size_t size) { void *buf; @@ -129,163 +127,17 @@ static void *mpool_calloc(size_t nmemb, size_t size) return buf; } -/* - * Returns a pointer to node rank zero. Returns NULL on error. - */ -static ompi_proc_t * -get_node_rank_zero_proc_ptr(ompi_proc_t **proc_world, - size_t proc_world_size) + +static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n) { - size_t num_local_procs = 0; - - if (NULL == proc_world) { - return NULL; - } - /* sort the procs list and get a pointer to the lowest node rank */ - if (OMPI_SUCCESS != mca_common_sm_local_proc_reorder(proc_world, - proc_world_size, - &num_local_procs)) { - opal_output(0, "mca_common_sm_local_proc_reorder failure! " - "Cannot continue.\n"); - return NULL; - } - - return proc_world[0]; -} - -static int -do_segmented_modex_recv(mca_btl_sm_component_t *comp_ptr, - mca_btl_sm_modex_t **sm_modex_bufp) -{ - int member_id = 0, rc = OMPI_ERROR; - size_t segment_size = 0, member_offset = 0; - /* start with the full modex buffer size */ - size_t key_len = 0, proc_world_size = 0; - unsigned char *modex_bufp = NULL; - char *key = NULL, *modex_comp_name = NULL; - void *tmp_bufp = NULL; - ompi_proc_t **proc_world = NULL, *proc_node_rank_zero = NULL; - - if (NULL == (modex_bufp = calloc(1, sizeof(mca_btl_sm_modex_t)))) { - rc = OMPI_ERR_OUT_OF_RESOURCE; - goto out; - } - /* stash the base of the modex buffer, because modex_bufp gets modified */ - *sm_modex_bufp = (mca_btl_sm_modex_t *)modex_bufp; - if (NULL == (proc_world = ompi_proc_world(&proc_world_size))) { - opal_output(0, "ompi_proc_world failure! Cannot continue.\n"); - rc = OMPI_ERROR; - goto out; - } - if (NULL == (proc_node_rank_zero = - get_node_rank_zero_proc_ptr(proc_world, proc_world_size))) { - opal_output(0, "get_node_rank_zero_proc_ptr failure! " - "Cannot continue.\n"); - rc = OMPI_ERROR; - goto out; - } - if (NULL == (modex_comp_name = - mca_base_component_to_string(&comp_ptr->super.btl_version))) { - rc = OMPI_ERR_OUT_OF_RESOURCE; - goto out; - } - /* SM_MODEX_STR_PAD to accommodate the member id and key index */ - key_len = strlen(modex_comp_name) + SM_MODEX_STR_PAD; - if (NULL == (key = calloc(key_len, sizeof(*key)))) { - rc = OMPI_ERR_OUT_OF_RESOURCE; - goto out; - } - - /* iterate over all modex members and store their respective pieces */ - for (member_id = 0; member_id < SM_MODEX_NUM_MEMBERS; ++member_id) { - if (OMPI_SUCCESS != (rc = - mca_btl_sm_get_modex_member_off_n_size(*sm_modex_bufp, member_id, - &member_offset, NULL))) { - goto out; - } - if (member_id < 2) { /* for mids 0 and 1 */ - opal_shmem_ds_t *tmp_ds = calloc(1, sizeof(*tmp_ds)); - size_t path_offset = offsetof(opal_shmem_ds_t, seg_name); - char *tmp_pathp = NULL; - if (NULL == tmp_ds) { - rc = OMPI_ERR_OUT_OF_RESOURCE; - goto out; - } - (void)snprintf(key, key_len, "%s-%d-%d", - modex_comp_name, member_id, 0); - rc = ompi_modex_recv_string((const char *)key, proc_node_rank_zero, - &tmp_bufp, &segment_size); - (void)memmove(tmp_ds, tmp_bufp, segment_size); - free(tmp_bufp); - /* now copy the path stuff */ - (void)snprintf(key, key_len, "%s-%d-%d", - modex_comp_name, member_id, 1); - rc = ompi_modex_recv_key_value((const char *)key, proc_node_rank_zero, - &tmp_bufp, OPAL_STRING); - tmp_pathp = (char *)tmp_bufp; - (void)memmove((unsigned char *)tmp_ds + path_offset, - tmp_pathp, strlen(tmp_pathp) + 1); - modex_bufp = ((unsigned char *)*sm_modex_bufp) + member_offset; - (void)memmove(modex_bufp, tmp_ds, sizeof(*tmp_ds)); - free(tmp_ds); - free(tmp_bufp); - } - else { /* for mid 2 */ - (void)snprintf(key, key_len, "%s-%d", modex_comp_name, member_id); - rc = ompi_modex_recv_string((const char *)key, proc_node_rank_zero, - &tmp_bufp, &segment_size); - if (OMPI_SUCCESS != rc) { - /* rc is set */ - goto out; - } - modex_bufp = ((unsigned char *)*sm_modex_bufp) + member_offset; - (void)memmove(modex_bufp, tmp_bufp, segment_size); - free(tmp_bufp); - } - } - -out: - if (NULL != modex_comp_name) { - free(modex_comp_name); - } - if (NULL != key) { - free(key); - } - if (NULL != proc_world) { - free(proc_world); - } - if (OMPI_SUCCESS != rc && NULL != *sm_modex_bufp) { - free(*sm_modex_bufp); - *sm_modex_bufp = NULL; - } - return rc; -} -/* - * Modex receive. Caller is responsible for freeing returned resources. - */ -static inline int -recv_modex(mca_btl_sm_component_t *comp_ptr, - mca_btl_sm_modex_t **out_modex) -{ - int rc; - - if (OMPI_SUCCESS != (rc = do_segmented_modex_recv(comp_ptr, out_modex))) { - opal_output(0, "recv_modex: do_segmented_modex_recv failure!\n"); - } - return rc; -} - -static int -sm_btl_first_time_init(mca_btl_sm_t *sm_btl, - int32_t my_smp_rank, - int n) -{ - size_t length, length_payload; + size_t size, length, length_payload; + char *sm_ctl_file; sm_fifo_t *my_fifos; - int my_mem_node, num_mem_nodes, i, rc; - mca_mpool_base_resources_t *res = NULL; + int my_mem_node, num_mem_nodes, i; + ompi_proc_t **procs; + size_t num_procs; + mca_mpool_base_resources_t res; mca_btl_sm_component_t* m = &mca_btl_sm_component; - mca_btl_sm_modex_t *modex = NULL; /* Assume we don't have hwloc support and fill in dummy info */ mca_btl_sm_component.mem_node = my_mem_node = 0; @@ -338,43 +190,50 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl, } #endif - if (NULL == (res = calloc(1, sizeof(*res)))) { + /* lookup shared memory pool */ + mca_btl_sm_component.sm_mpools = (mca_mpool_base_module_t **) calloc(num_mem_nodes, + sizeof(mca_mpool_base_module_t*)); + + /* Disable memory binding, because each MPI process will claim + pages in the mpool for their local NUMA node */ + res.mem_node = -1; + + /* determine how much memory to create */ + /* + * This heuristic formula mostly says that we request memory for: + * - nfifos FIFOs, each comprising: + * . a sm_fifo_t structure + * . many pointers (fifo_size of them per FIFO) + * - eager fragments (2*n of them, allocated in sm_free_list_inc chunks) + * - max fragments (sm_free_list_num of them) + * + * On top of all that, we sprinkle in some number of + * "opal_cache_line_size" additions to account for some + * padding and edge effects that may lie in the allocator. + */ + res.size = + FIFO_MAP_NUM(n) * ( sizeof(sm_fifo_t) + sizeof(void *) * m->fifo_size + 4 * opal_cache_line_size ) + + ( 2 * n + m->sm_free_list_inc ) * ( m->eager_limit + 2 * opal_cache_line_size ) + + m->sm_free_list_num * ( m->max_frag_size + 2 * opal_cache_line_size ); + + /* before we multiply by n, make sure the result won't overflow */ + /* Stick that little pad in, particularly since we'll eventually + * need a little extra space. E.g., in mca_mpool_sm_init() in + * mpool_sm_component.c when sizeof(mca_common_sm_module_t) is + * added. + */ + if ( ((double) res.size) * n > LONG_MAX - 4096 ) { return OMPI_ERR_OUT_OF_RESOURCE; } - /* everyone receive modex information. all but node rank zero attach to the - * segments stored within the modex. remember: node rank zero is already - * attached to sm_seg. */ - if (OMPI_SUCCESS != (rc = recv_modex(m, &modex))) { - free(res); - return rc; - } - /* lookup shared memory pool */ - mca_btl_sm_component.sm_mpools = - (mca_mpool_base_module_t **)calloc(num_mem_nodes, - sizeof(mca_mpool_base_module_t *)); - - /* Disable memory binding, because each MPI process will claim pages in the - * mpool for their local NUMA node */ - res->mem_node = -1; - res->size = modex->mpool_res_size; - - /* copy mpool's modex info into its base resources */ - if (OPAL_SUCCESS != - opal_shmem_ds_copy(&(modex->sm_mpool_meta_buf), - &(res->bs_meta_buf))) { - free(res); - free(modex); - return OMPI_ERROR; - } - /* now that res is fully populated, create the thing */ + res.size *= n; + + /* now, create it */ mca_btl_sm_component.sm_mpools[0] = mca_mpool_base_module_create(mca_btl_sm_component.sm_mpool_name, - sm_btl, res); + sm_btl, &res); /* Sanity check to ensure that we found it */ if (NULL == mca_btl_sm_component.sm_mpools[0]) { - free(res); - free(modex); - return OMPI_ERR_OUT_OF_RESOURCE; + return OMPI_ERR_OUT_OF_RESOURCE; } mca_btl_sm_component.sm_mpool = mca_btl_sm_component.sm_mpools[0]; @@ -386,27 +245,37 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl, mca_btl_sm_component.sm_peers = (struct mca_btl_base_endpoint_t**) calloc(n, sizeof(struct mca_btl_base_endpoint_t*)); if (NULL == mca_btl_sm_component.sm_peers) { - free(res); - free(modex); return OMPI_ERR_OUT_OF_RESOURCE; } - if (0 != my_smp_rank) { - if (NULL == (mca_btl_sm_component.sm_seg = - mca_common_sm_module_attach(&modex->sm_meta_buf, - sizeof(mca_common_sm_seg_header_t), - opal_cache_line_size))) { - /* don't have to detach here, because module_attach cleans up after - * itself on failure. */ - opal_output(0, "sm_btl_first_time_init: " - "mca_common_sm_module_attach failure!\n"); - free(modex); - free(res); - return OMPI_ERROR; - } + + /* Allocate Shared Memory BTL process coordination + * data structure. This will reside in shared memory */ + + /* set file name */ + if (asprintf(&sm_ctl_file, "%s"OPAL_PATH_SEP"shared_mem_btl_module.%s", + orte_process_info.job_session_dir, + orte_process_info.nodename) < 0) { + return OMPI_ERR_OUT_OF_RESOURCE; } - /* it is now safe to free the modex and the mpool resources */ - free(modex); - free(res); + + /* Pass in a data segment alignment of 0 to get no data + segment (only the shared control structure) */ + size = sizeof(mca_common_sm_seg_header_t) + + n * (sizeof(sm_fifo_t*) + sizeof(char *) + sizeof(uint16_t)) + opal_cache_line_size; + procs = ompi_proc_world(&num_procs); + if (!(mca_btl_sm_component.sm_seg = + mca_common_sm_init(procs, num_procs, size, sm_ctl_file, + sizeof(mca_common_sm_seg_header_t), + opal_cache_line_size))) { + opal_output(0, "mca_btl_sm_add_procs: unable to create shared memory " + "BTL coordinating strucure :: size %lu \n", + (unsigned long)size); + free(procs); + free(sm_ctl_file); + return OMPI_ERROR; + } + free(procs); + free(sm_ctl_file); /* check to make sure number of local procs is within the * specified limits */ @@ -505,7 +374,6 @@ static struct mca_btl_base_endpoint_t * create_sm_endpoint(int local_proc, struct ompi_proc_t *proc) { struct mca_btl_base_endpoint_t *ep; - #if OMPI_ENABLE_PROGRESS_THREADS == 1 char path[PATH_MAX]; #endif @@ -533,6 +401,22 @@ create_sm_endpoint(int local_proc, struct ompi_proc_t *proc) return ep; } +static void calc_sm_max_procs(int n) +{ + /* see if need to allocate space for extra procs */ + if(0 > mca_btl_sm_component.sm_max_procs) { + /* no limit */ + if(0 <= mca_btl_sm_component.sm_extra_procs) { + /* limit */ + mca_btl_sm_component.sm_max_procs = + n + mca_btl_sm_component.sm_extra_procs; + } else { + /* no limit */ + mca_btl_sm_component.sm_max_procs = 2 * n; + } + } +} + int mca_btl_sm_add_procs( struct mca_btl_base_module_t* btl, size_t nprocs, @@ -546,9 +430,6 @@ int mca_btl_sm_add_procs( mca_btl_sm_t *sm_btl; bool have_connected_peer = false; char **bases; - /* for easy access to the mpool_sm_module */ - mca_mpool_sm_module_t *sm_mpool_modp = NULL; - /* initializion */ sm_btl = (mca_btl_sm_t *)btl; @@ -561,7 +442,7 @@ int mca_btl_sm_add_procs( * and idetify procs that are on this host. Add procs on this * host to shared memory reachbility list. Also, get number * of local procs in the procs list. */ - for (proc = 0; proc < (int32_t)nprocs; proc++) { + for(proc = 0; proc < (int32_t)nprocs; proc++) { /* check to see if this proc can be reached via shmem (i.e., if they're on my local host and in my job) */ if (procs[proc]->proc_name.jobid != my_proc->proc_name.jobid || @@ -596,18 +477,18 @@ int mca_btl_sm_add_procs( goto CLEANUP; /* make sure that my_smp_rank has been defined */ - if (-1 == my_smp_rank) { + if(-1 == my_smp_rank) { return_code = OMPI_ERROR; goto CLEANUP; } + calc_sm_max_procs(n_local_procs); + if (!sm_btl->btl_inited) { return_code = - sm_btl_first_time_init(sm_btl, my_smp_rank, - mca_btl_sm_component.sm_max_procs); - if (return_code != OMPI_SUCCESS) { + sm_btl_first_time_init(sm_btl, mca_btl_sm_component.sm_max_procs); + if(return_code != OMPI_SUCCESS) goto CLEANUP; - } } /* set local proc's smp rank in the peers structure for @@ -620,7 +501,6 @@ int mca_btl_sm_add_procs( } bases = mca_btl_sm_component.shm_bases; - sm_mpool_modp = (mca_mpool_sm_module_t *)mca_btl_sm_component.sm_mpool; /* initialize own FIFOs */ /* @@ -644,34 +524,13 @@ int mca_btl_sm_add_procs( /* Sync with other local procs. Force the FIFO initialization to always * happens before the readers access it. */ - opal_atomic_add_32(&mca_btl_sm_component.sm_seg->module_seg->seg_inited, 1); + opal_atomic_add_32( &mca_btl_sm_component.sm_seg->module_seg->seg_inited, 1); while( n_local_procs > mca_btl_sm_component.sm_seg->module_seg->seg_inited) { opal_progress(); opal_atomic_rmb(); } - /* it is now safe to unlink the shared memory segment. only one process - * needs to do this, so just let smp rank zero take care of it. */ - if (0 == my_smp_rank) { - if (OMPI_SUCCESS != - mca_common_sm_module_unlink(mca_btl_sm_component.sm_seg)) { - /* it is "okay" if this fails at this point. we have gone this far, - * so just warn about the failure and continue. this is probably - * only triggered by a programming error. */ - opal_output(0, "WARNING: common_sm_module_unlink failed.\n"); - } - /* SKG - another abstraction violation here, but I don't want to add - * extra code in the sm mpool for further synchronization. */ - - /* at this point, all processes have attached to the mpool segment. so - * it is safe to unlink it here. */ - if (OMPI_SUCCESS != - mca_common_sm_module_unlink(sm_mpool_modp->sm_common_module)) { - opal_output(0, "WARNING: common_sm_module_unlink failed.\n"); - } - } - /* coordinate with other processes */ for(j = mca_btl_sm_component.num_smp_procs; j < mca_btl_sm_component.num_smp_procs + n_local_procs; j++) { diff --git a/ompi/mca/btl/sm/btl_sm.h b/ompi/mca/btl/sm/btl_sm.h index 07be4cd53c..2a02f543b4 100644 --- a/ompi/mca/btl/sm/btl_sm.h +++ b/ompi/mca/btl/sm/btl_sm.h @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved. * Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2013 Los Alamos National Security, LLC. + * Copyright (c) 2010 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2010-2012 IBM Corporation. All rights reserved. * $COPYRIGHT$ @@ -42,8 +42,6 @@ #include "opal/util/bit_ops.h" #include "opal/class/opal_free_list.h" -#include "opal/mca/shmem/shmem.h" - #include "ompi/mca/btl/btl.h" #include "ompi/mca/common/sm/common_sm.h" @@ -85,10 +83,6 @@ BEGIN_C_DECLS line that should hopefully be good in most places. */ #define SM_CACHE_LINE_PAD 128 -/* number of members in mca_btl_sm_modex_t */ -#define SM_MODEX_NUM_MEMBERS 3 -#define SM_MODEX_STR_PAD 32 - struct sm_fifo_t { /* This queue pointer is used only by the heads. */ volatile void **queue; @@ -127,58 +121,6 @@ typedef struct mca_btl_sm_mem_node_t { mca_mpool_base_module_t* sm_mpool; /**< shared memory pool */ } mca_btl_sm_mem_node_t; -/** - * Shared Memory (SM) BTL modex. - * Please update SM_MODEX_NUM_MEMBERS if the number of members ever changes. - */ -struct mca_btl_sm_modex_t { - /* 0 */ - opal_shmem_ds_t sm_meta_buf; - /* 1 */ - opal_shmem_ds_t sm_mpool_meta_buf; - /* 2 */ - size_t mpool_res_size; -}; -typedef struct mca_btl_sm_modex_t mca_btl_sm_modex_t; - -static inline int -mca_btl_sm_get_modex_member_off_n_size(const mca_btl_sm_modex_t *bp, - int mid, size_t *out_off, - size_t *out_size) { - switch (mid) { - /* sm_meta_buf */ - case 0: - if (NULL != out_off) { - *out_off = offsetof(mca_btl_sm_modex_t, sm_meta_buf); - } - if (NULL != out_size) { - *out_size = opal_shmem_sizeof_shmem_ds(&bp->sm_meta_buf); - } - break; - /* sm_mpool_meta_buf */ - case 1: - if (NULL != out_off) { - *out_off = offsetof(mca_btl_sm_modex_t, sm_mpool_meta_buf); - } - if (NULL != out_size) { - *out_size = opal_shmem_sizeof_shmem_ds(&bp->sm_mpool_meta_buf); - } - break; - case 2: - /* mpool_res_size */ - if (NULL != out_off) { - *out_off = offsetof(mca_btl_sm_modex_t, mpool_res_size); - } - if (NULL != out_size) { - *out_size = sizeof(bp->mpool_res_size); - } - break; - default: - return OMPI_ERR_VALUE_OUT_OF_BOUNDS; - } - return OMPI_SUCCESS; -} - /** * Shared Memory (SM) BTL module. */ diff --git a/ompi/mca/btl/sm/btl_sm_component.c b/ompi/mca/btl/sm/btl_sm_component.c index 20d06cedb7..9b66830c5a 100644 --- a/ompi/mca/btl/sm/btl_sm_component.c +++ b/ompi/mca/btl/sm/btl_sm_component.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved. * Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2013 Los Alamos National Security, LLC. + * Copyright (c) 2010-2011 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2011 NVIDIA Corporation. All rights reserved. * Copyright (c) 2010-2012 IBM Corporation. All rights reserved. @@ -42,18 +42,14 @@ #include /* for mkfifo */ #endif /* HAVE_SYS_STAT_H */ -#include "opal/mca/base/mca_base_param.h" -#include "opal/mca/shmem/base/base.h" -#include "opal/mca/shmem/shmem.h" +#include "ompi/constants.h" #include "opal/util/bit_ops.h" #include "opal/util/output.h" - +#include "orte/util/proc_info.h" #include "orte/util/show_help.h" #include "orte/runtime/orte_globals.h" -#include "orte/util/proc_info.h" -#include "ompi/constants.h" -#include "ompi/runtime/ompi_module_exchange.h" +#include "opal/mca/base/mca_base_param.h" #include "ompi/mca/mpool/base/base.h" #include "ompi/mca/common/sm/common_sm.h" #include "ompi/mca/btl/base/btl_base_error.h" @@ -360,450 +356,52 @@ CLEANUP: return return_value; } -/* - * Returns the number of processes on the node. - */ -static inline int -get_num_local_procs(void) -{ - /* num_local_peers does not include us in - * its calculation, so adjust for that */ - return (int)(1 + orte_process_info.num_local_peers); -} - -static void -calc_sm_max_procs(int n) -{ - /* see if need to allocate space for extra procs */ - if (0 > mca_btl_sm_component.sm_max_procs) { - /* no limit */ - if (0 <= mca_btl_sm_component.sm_extra_procs) { - /* limit */ - mca_btl_sm_component.sm_max_procs = - n + mca_btl_sm_component.sm_extra_procs; - } else { - /* no limit */ - mca_btl_sm_component.sm_max_procs = 2 * n; - } - } -} - -static int -create_and_attach(mca_btl_sm_component_t *comp_ptr, - size_t size, - char *file_name, - size_t size_ctl_structure, - size_t data_seg_alignment, - mca_common_sm_module_t **out_modp) - -{ - if (NULL == (*out_modp = - mca_common_sm_module_create_and_attach(size, file_name, - size_ctl_structure, - data_seg_alignment))) { - opal_output(0, "create_and_attach: unable to create shared memory " - "BTL coordinating strucure :: size %lu \n", - (unsigned long)size); - return OMPI_ERROR; - } - return OMPI_SUCCESS; -} - -/* - * SKG - I'm not happy with this, but I can't figure out a better way of - * finding the sm mpool's minimum size 8-|. The way I see it. This BTL only - * uses the sm mpool, so maybe this isn't so bad... - * - * The problem is the we need to size the mpool resources at sm BTL component - * init. That means we need to know the mpool's minimum size at create. - */ -static int -get_min_mpool_size(mca_btl_sm_component_t *comp_ptr, - size_t *out_size) -{ - char *type_name = "mpool"; - char *param_name = "min_size"; - char *min_size = NULL; - int id = 0; - size_t default_min = 67108864; - size_t size = 0; - long tmp_size = 0; - - if (0 > (id = mca_base_param_find(type_name, comp_ptr->sm_mpool_name, - param_name))) { - opal_output(0, "mca_base_param_find: failure looking for %s_%s_%s\n", - type_name, comp_ptr->sm_mpool_name, param_name); - return OMPI_ERR_NOT_FOUND; - } - if (OPAL_ERROR == mca_base_param_lookup_string(id, &min_size)) { - opal_output(0, "mca_base_param_lookup_string failure\n"); - return OMPI_ERROR; - } - errno = 0; - tmp_size = strtol(min_size, (char **)NULL, 10); - if (ERANGE == errno || EINVAL == errno || tmp_size <= 0) { - opal_output(0, "mca_btl_sm::get_min_mpool_size: " - "Unusable %s_%s_min_size provided. " - "Continuing with %lu.", type_name, - comp_ptr->sm_mpool_name, - (unsigned long)default_min); - - size = default_min; - } - else { - size = (size_t)tmp_size; - } - free(min_size); - *out_size = size; - return OMPI_SUCCESS; -} - -static int -get_mpool_res_size(int32_t max_procs, - size_t *out_res_size) -{ - size_t size = 0; - /* determine how much memory to create */ - /* - * This heuristic formula mostly says that we request memory for: - * - nfifos FIFOs, each comprising: - * . a sm_fifo_t structure - * . many pointers (fifo_size of them per FIFO) - * - eager fragments (2*n of them, allocated in sm_free_list_inc chunks) - * - max fragments (sm_free_list_num of them) - * - * On top of all that, we sprinkle in some number of - * "opal_cache_line_size" additions to account for some - * padding and edge effects that may lie in the allocator. - */ - size = FIFO_MAP_NUM(max_procs) * - (sizeof(sm_fifo_t) + sizeof(void *) * - mca_btl_sm_component.fifo_size + 4 * opal_cache_line_size) + - (2 * max_procs + mca_btl_sm_component.sm_free_list_inc) * - (mca_btl_sm_component.eager_limit + 2 * opal_cache_line_size) + - mca_btl_sm_component.sm_free_list_num * - (mca_btl_sm_component.max_frag_size + 2 * opal_cache_line_size); - - /* add something for the control structure */ - size += sizeof(mca_common_sm_module_t); - - /* before we multiply by max_procs, make sure the result won't overflow */ - /* Stick that little pad in, particularly since we'll eventually - * need a little extra space. E.g., in mca_mpool_sm_init() in - * mpool_sm_component.c when sizeof(mca_common_sm_module_t) is - * added. - */ - if (((double)size) * max_procs > LONG_MAX - 4096) { - return OMPI_ERR_VALUE_OUT_OF_BOUNDS; - } - size *= (size_t)max_procs; - *out_res_size = size; - return OMPI_SUCCESS; -} - -/* - * Creates the shared-memory segments required for this BTL. One for the sm - * mpool and another for the shared memory store and populates *modex_buf_ptr. - * - * it is assumed that calc_sm_max_procs has already been called (sets - * sm_max_procs). - */ -static int -populate_modex_bufp(mca_btl_sm_component_t *comp_ptr, - mca_btl_sm_modex_t *modex_buf_ptr) -{ - int rc = OMPI_SUCCESS; - size_t size = 0; - size_t min_size = 0; - char *sm_mpool_ctl_file = NULL; - char *sm_ctl_file = NULL; - /* used as a temporary store so we can extract shmem_ds info */ - mca_common_sm_module_t *tmp_modp = NULL; - - /* first generate some unique paths for the shared-memory segments that - * this BTL needs. */ - if (asprintf(&sm_mpool_ctl_file, - "%s"OPAL_PATH_SEP"shared_mem_pool.%s", - orte_process_info.job_session_dir, - orte_process_info.nodename) < 0) { - rc = OMPI_ERR_OUT_OF_RESOURCE; - goto out; - } - if (asprintf(&sm_ctl_file, - "%s"OPAL_PATH_SEP"shared_mem_btl_module.%s", - orte_process_info.job_session_dir, - orte_process_info.nodename) < 0) { - rc = OMPI_ERR_OUT_OF_RESOURCE; - goto out; - } - - /* create the things */ - - /* === sm mpool === */ - /* get the segment size for the sm mpool. */ - if (OMPI_SUCCESS != (rc = get_mpool_res_size(comp_ptr->sm_max_procs, - &size))) { - /* rc is already set */ - goto out; - } - /* do we need to update the size based on the sm mpool's min size? */ - if (OMPI_SUCCESS != (rc = get_min_mpool_size(comp_ptr, &min_size))) { - goto out; - } - if (size < min_size) { - size = min_size; - } - /* we only need the shmem_ds info at this point. initilization will be - * completed in the mpool module code. the idea is that we just need this - * info so we can populate the modex. */ - if (OMPI_SUCCESS != (rc = - create_and_attach(comp_ptr, size, sm_mpool_ctl_file, - sizeof(mca_common_sm_module_t), 8, &tmp_modp))) { - /* rc is set */ - goto out; - } - /* now extract and store the shmem_ds info from the returned module */ - if (OPAL_SUCCESS != - opal_shmem_ds_copy(&(tmp_modp->shmem_ds), - &(modex_buf_ptr->sm_mpool_meta_buf))) { - rc = OMPI_ERROR; - goto out; - } - /* set the mpool_res_size in the modex */ - modex_buf_ptr->mpool_res_size = size; - - /* === sm btl === */ - /* calculate the segment size. */ - size = sizeof(mca_common_sm_seg_header_t) + - comp_ptr->sm_max_procs * - (sizeof(sm_fifo_t *) + - sizeof(char *) + sizeof(uint16_t)) + - opal_cache_line_size; - - if (OMPI_SUCCESS != (rc = - create_and_attach(comp_ptr, size, sm_ctl_file, - sizeof(mca_common_sm_seg_header_t), - opal_cache_line_size, &comp_ptr->sm_seg))) { - /* rc is set */ - goto out; - } - /* now extract and store the shmem_ds info from the returned module */ - if (OPAL_SUCCESS != opal_shmem_ds_copy(&(comp_ptr->sm_seg->shmem_ds), - &(modex_buf_ptr->sm_meta_buf))) { - rc = OMPI_ERROR; - goto out; - } - -out: - if (NULL != sm_mpool_ctl_file) { - free(sm_mpool_ctl_file); - } - if (NULL != sm_ctl_file) { - free(sm_ctl_file); - } - return rc; -} - -static int -send_member(char *key_prefix, - unsigned char *member_basep, - size_t extent, - int member_id) -{ - char *key = NULL; - int rc = OMPI_ERROR; - size_t shmem_path_offset = 0; - - switch (member_id) { - case 0: - case 1: - shmem_path_offset = offsetof(opal_shmem_ds_t, seg_name); - if (-1 == asprintf(&key, "%s-%d", key_prefix, 0)) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - if (OMPI_SUCCESS != (rc = - ompi_modex_send_string((const char *)key, - member_basep, shmem_path_offset))) { - free(key); - return rc; - } - free(key); - if (-1 == asprintf(&key, "%s-%d", key_prefix, 1)) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - /* using ompi_modex_send_key_value here, so the data isn't encoded - * if using PMI grpcomm. */ - if (OMPI_SUCCESS != (rc = - ompi_modex_send_key_value(key, - (member_basep + shmem_path_offset), - OPAL_STRING))) { - free(key); - return rc; - } - free(key); - return OMPI_SUCCESS; - case 2: - if (OMPI_SUCCESS != (rc = - ompi_modex_send_string((const char *)key_prefix, - member_basep, extent))) { - free(key); - return rc; - } - return OMPI_SUCCESS; - default: - return OMPI_ERR_VALUE_OUT_OF_BOUNDS; - } - return OMPI_ERROR; -} - -static int -send_all_modex_members(mca_btl_sm_component_t *comp_ptr, - mca_btl_sm_modex_t *modex_bufp) -{ - size_t offset = 0, extent = 0; - unsigned char *datap = (unsigned char *)modex_bufp; - unsigned char *tmp_base = NULL; - char *modex_comp_name = NULL; - int rc, mid; - char *key; - - if (NULL == (modex_comp_name = - mca_base_component_to_string(&comp_ptr->super.btl_version))) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - /* iterate over all the modex members and pack the data into one message - * buffer */ - for (mid = 0; mid < SM_MODEX_NUM_MEMBERS; ++mid) { - if (OMPI_SUCCESS != (rc = - mca_btl_sm_get_modex_member_off_n_size(modex_bufp, mid, - &offset, &extent))) { - /* rc is set */ - goto out; - } - tmp_base = (unsigned char *)datap + offset; - if (-1 == asprintf(&key, "%s-%d", modex_comp_name, mid)) { - rc = OMPI_ERR_OUT_OF_RESOURCE; - goto out; - } - if (OMPI_SUCCESS != (rc = send_member(key, tmp_base, extent, mid))) { - free(key); - goto out; - } - free(key); - } - -out: - if (NULL != modex_comp_name) { - free(modex_comp_name); - } - return rc; -} - -/* - * Creates information required for the sm modex and modex sends it. - */ -static int -send_modex(mca_btl_sm_component_t *comp_ptr, - orte_node_rank_t node_rank) -{ - int rc = OMPI_SUCCESS; - mca_btl_sm_modex_t *sm_modex = NULL; - - /* only node rank zero needs to send modex info */ - if (0 != node_rank) { - return OMPI_SUCCESS; - } - if (NULL == (sm_modex = calloc(1, sizeof(*sm_modex)))) { - /* out of resources, so just bail. */ - return OMPI_ERR_OUT_OF_RESOURCE; - } - if (OMPI_SUCCESS != (rc = populate_modex_bufp(comp_ptr, sm_modex))) { - opal_output(0, "send_modex: populate_modex_bufp failure!\n"); - /* rc is set */ - goto out; - } - rc = send_all_modex_members(comp_ptr, sm_modex); - -out: - if (NULL != sm_modex) { - free(sm_modex); - } - return rc; -} - /* * SM component initialization */ -static mca_btl_base_module_t ** -mca_btl_sm_component_init(int *num_btls, - bool enable_progress_threads, - bool enable_mpi_threads) +static mca_btl_base_module_t** mca_btl_sm_component_init( + int *num_btls, + bool enable_progress_threads, + bool enable_mpi_threads) { - int num_local_procs = 0; mca_btl_base_module_t **btls = NULL; - orte_node_rank_t my_node_rank = ORTE_NODE_RANK_INVALID; #if OMPI_BTL_SM_HAVE_KNEM int rc; #endif *num_btls = 0; + + /* if no session directory was created, then we cannot be used */ + if (!orte_create_session_dirs) { + return NULL; + } + /* lookup/create shared memory pool only when used */ mca_btl_sm_component.sm_mpool = NULL; mca_btl_sm_component.sm_mpool_base = NULL; - /* if no session directory was created, then we cannot be used */ - /* SKG - this isn't true anymore. Some backing facilities don't require a - * file-backed store. Extend shmem to provide this info one day. */ - if (!orte_create_session_dirs) { - return NULL; - } - /* if we don't have locality information, then we cannot be used */ - if (ORTE_NODE_RANK_INVALID == - (my_node_rank = orte_process_info.my_node_rank)) { - orte_show_help("help-mpi-btl-sm.txt", "no locality", true); - return NULL; - } - /* no use trying to use sm with less than two procs, so just bail. */ - if ((num_local_procs = get_num_local_procs()) < 2) { - return NULL; - } - /* calculate max procs so we can figure out how large to make the - * shared-memory segment. this routine sets component sm_max_procs. */ - calc_sm_max_procs(num_local_procs); - - if (OMPI_SUCCESS != send_modex(&mca_btl_sm_component, my_node_rank)) { - return NULL; - } - #if OMPI_ENABLE_PROGRESS_THREADS == 1 /* create a named pipe to receive events */ - sprintf(mca_btl_sm_component.sm_fifo_path, - "%s"OPAL_PATH_SEP"sm_fifo.%lu", - orte_process_info.job_session_dir, - (unsigned long)ORTE_PROC_MY_NAME->vpid); - if (mkfifo(mca_btl_sm_component.sm_fifo_path, 0660) < 0) { - opal_output(0, "mca_btl_sm_component_init: " - "mkfifo failed with errno=%d\n",errno); + sprintf( mca_btl_sm_component.sm_fifo_path, + "%s"OPAL_PATH_SEP"sm_fifo.%lu", orte_process_info.job_session_dir, + (unsigned long)ORTE_PROC_MY_NAME->vpid ); + if(mkfifo(mca_btl_sm_component.sm_fifo_path, 0660) < 0) { + opal_output(0, "mca_btl_sm_component_init: mkfifo failed with errno=%d\n",errno); return NULL; } - mca_btl_sm_component.sm_fifo_fd = open(mca_btl_sm_component.sm_fifo_path, - O_RDWR); + mca_btl_sm_component.sm_fifo_fd = open(mca_btl_sm_component.sm_fifo_path, O_RDWR); if(mca_btl_sm_component.sm_fifo_fd < 0) { - opal_output(0, "mca_btl_sm_component_init: " - "open(%s) failed with errno=%d\n", + opal_output(0, "mca_btl_sm_component_init: open(%s) failed with errno=%d\n", mca_btl_sm_component.sm_fifo_path, errno); return NULL; } OBJ_CONSTRUCT(&mca_btl_sm_component.sm_fifo_thread, opal_thread_t); - mca_btl_sm_component.sm_fifo_thread.t_run = - (opal_thread_fn_t)mca_btl_sm_component_event_thread; + mca_btl_sm_component.sm_fifo_thread.t_run = (opal_thread_fn_t) mca_btl_sm_component_event_thread; opal_thread_start(&mca_btl_sm_component.sm_fifo_thread); #endif - mca_btl_sm_component.sm_btls = - (mca_btl_sm_t **)malloc(mca_btl_sm_component.sm_max_btls * - sizeof(mca_btl_sm_t *)); + mca_btl_sm_component.sm_btls = (mca_btl_sm_t **) malloc( mca_btl_sm_component.sm_max_btls * sizeof (mca_btl_sm_t *)); if (NULL == mca_btl_sm_component.sm_btls) { return NULL; } diff --git a/ompi/mca/btl/sm/help-mpi-btl-sm.txt b/ompi/mca/btl/sm/help-mpi-btl-sm.txt index 9d868e8445..b6905097d6 100644 --- a/ompi/mca/btl/sm/help-mpi-btl-sm.txt +++ b/ompi/mca/btl/sm/help-mpi-btl-sm.txt @@ -4,8 +4,6 @@ # of Tennessee Research Foundation. All rights # reserved. # Copyright (c) 2006-2010 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2012 Los Alamos National Security, LLC. -# All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -14,10 +12,6 @@ # # This is the US/English help file for Open MPI's shared memory support. # -[no locality] -WARNING: Missing locality information required for sm initialization. -Continuing without shared memory support. -# [knem requested but not supported] WARNING: Linux kernel knem support was requested for the shared memory (sm) BTL, but it is not supported. Deactivating the shared memory diff --git a/ompi/mca/common/sm/common_sm.c b/ompi/mca/common/sm/common_sm.c index 74dfdead43..889d9c3faa 100644 --- a/ompi/mca/common/sm/common_sm.c +++ b/ompi/mca/common/sm/common_sm.c @@ -42,7 +42,6 @@ #include "opal/align.h" #include "opal/util/argv.h" -#include "opal/mca/shmem/shmem.h" #if OPAL_ENABLE_FT_CR == 1 #include "opal/runtime/opal_cr.h" #endif @@ -134,7 +133,7 @@ attach_and_init(opal_shmem_ds_t *shmem_bufp, map->module_data_addr = addr; map->module_seg_addr = (unsigned char *)seg; - + /* note that size is only used during the first call */ if (first_call) { /* initialize some segment information */ @@ -158,20 +157,20 @@ attach_and_init(opal_shmem_ds_t *shmem_bufp, } /* ////////////////////////////////////////////////////////////////////////// */ -/* api implementation */ +/* api implementation */ /* ////////////////////////////////////////////////////////////////////////// */ /* ////////////////////////////////////////////////////////////////////////// */ mca_common_sm_module_t * -mca_common_sm_module_create_and_attach(size_t size, - char *file_name, - size_t size_ctl_structure, - size_t data_seg_alignment) +mca_common_sm_module_create(size_t size, + char *file_name, + size_t size_ctl_structure, + size_t data_seg_alignment) { mca_common_sm_module_t *map = NULL; opal_shmem_ds_t *seg_meta = NULL; - if (NULL == (seg_meta = (opal_shmem_ds_t *)malloc(sizeof(*seg_meta)))) { + if (NULL == (seg_meta = (opal_shmem_ds_t *) malloc(sizeof(*seg_meta)))) { /* out of resources */ return NULL; } @@ -198,39 +197,33 @@ mca_common_sm_module_attach(opal_shmem_ds_t *seg_meta, size_t size_ctl_structure, size_t data_seg_alignment) { + mca_common_sm_module_t *map = NULL; + /* notice that size is 0 here. it really doesn't matter because size WILL * NOT be used because this is an attach (first_call is false). */ - return attach_and_init(seg_meta, 0, size_ctl_structure, - data_seg_alignment, false); + map = attach_and_init(seg_meta, 0, size_ctl_structure, + data_seg_alignment, false); + + return map; } /* ////////////////////////////////////////////////////////////////////////// */ -int -mca_common_sm_module_unlink(mca_common_sm_module_t *modp) +mca_common_sm_module_t * +mca_common_sm_init(ompi_proc_t **procs, + size_t num_procs, + size_t size, + char *file_name, + size_t size_ctl_structure, + size_t data_seg_alignment) { - if (NULL == modp) { - return OMPI_ERROR; - } - if (OPAL_SUCCESS != opal_shmem_unlink(&modp->shmem_ds)) { - return OMPI_ERROR; - } - return OMPI_SUCCESS; -} - -/* ////////////////////////////////////////////////////////////////////////// */ -int -mca_common_sm_local_proc_reorder(ompi_proc_t **procs, - size_t num_procs, - size_t *out_num_local_procs) -{ - size_t num_local_procs = 0; - bool found_lowest = false; + /* indicates whether or not i'm the lowest named process */ + bool lowest_local_proc = false; + mca_common_sm_module_t *map = NULL; ompi_proc_t *temp_proc = NULL; - size_t p; + bool found_lowest = false; + size_t num_local_procs = 0, p = 0; + opal_shmem_ds_t *seg_meta = NULL; - if (NULL == out_num_local_procs || NULL == procs) { - return OMPI_ERR_BAD_PARAM; - } /* o reorder procs array to have all the local procs at the beginning. * o look for the local proc with the lowest name. * o determine the number of local procs. @@ -247,7 +240,8 @@ mca_common_sm_local_proc_reorder(ompi_proc_t **procs, /* save this proc */ procs[num_local_procs] = procs[p]; /* if we have a new lowest, swap it with position 0 - * so that procs[0] is always the lowest named proc */ + * so that procs[0] is always the lowest named proc + */ if (OPAL_VALUE2_GREATER == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &(procs[p]->proc_name), @@ -263,31 +257,6 @@ mca_common_sm_local_proc_reorder(ompi_proc_t **procs, ++num_local_procs; } } - *out_num_local_procs = num_local_procs; - - return OMPI_SUCCESS; -} - -/* ////////////////////////////////////////////////////////////////////////// */ -mca_common_sm_module_t * -mca_common_sm_init(ompi_proc_t **procs, - size_t num_procs, - size_t size, - char *file_name, - size_t size_ctl_structure, - size_t data_seg_alignment) -{ - /* indicates whether or not i'm the lowest named process */ - bool lowest_local_proc = false; - mca_common_sm_module_t *map = NULL; - size_t num_local_procs = 0; - opal_shmem_ds_t *seg_meta = NULL; - - if (OMPI_SUCCESS != mca_common_sm_local_proc_reorder(procs, - num_procs, - &num_local_procs)) { - return NULL; - } /* if there is less than 2 local processes, there's nothing to do. */ if (num_local_procs < 2) { @@ -301,9 +270,9 @@ mca_common_sm_init(ompi_proc_t **procs, /* determine whether or not i am the lowest local process */ lowest_local_proc = - (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, - ORTE_PROC_MY_NAME, - &(procs[0]->proc_name))); + (0 == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, + ORTE_PROC_MY_NAME, + &(procs[0]->proc_name))); /* figure out if i am the lowest rank in the group. * if so, i will create the shared memory backing store @@ -465,3 +434,4 @@ mca_common_sm_fini(mca_common_sm_module_t *mca_common_sm_module) } return rc; } + diff --git a/ompi/mca/common/sm/common_sm.h b/ompi/mca/common/sm/common_sm.h index c916cc603c..b8fd007e1e 100644 --- a/ompi/mca/common/sm/common_sm.h +++ b/ompi/mca/common/sm/common_sm.h @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2012 Los Alamos National Security, LLC. + * Copyright (c) 2010-2011 Los Alamos National Security, LLC. * All rights reserved. * $COPYRIGHT$ * @@ -73,31 +73,18 @@ typedef struct mca_common_sm_module_t { OBJ_CLASS_DECLARATION(mca_common_sm_module_t); /** - * This routine reorders procs array to have all the local procs at the - * beginning and returns the number of local procs through out_num_local_procs. - * The proc with the lowest name is at the beginning of the reordered procs - * array. - * - * @returnvalue OMPI_SUCCESS on success, something else, otherwise. - */ -OMPI_DECLSPEC extern int -mca_common_sm_local_proc_reorder(ompi_proc_t **procs, - size_t num_procs, - size_t *out_num_local_procs); - -/** - * This routine is used to create and attach to a shared memory segment - * (whether it's an mmaped file or a SYSV IPC segment). It is assumed that + * This routine is used to create a shared memory segment (whether + * it's an mmaped file or a SYSV IPC segment). It is assumed that * the shared memory segment does not exist before this call. * * @returnvalue pointer to control structure at head of shared memory segment. * Returns NULL if an error occurred. */ -OMPI_DECLSPEC extern mca_common_sm_module_t * -mca_common_sm_module_create_and_attach(size_t size, - char *file_name, - size_t size_ctl_structure, - size_t data_seg_alignment); +mca_common_sm_module_t * +mca_common_sm_module_create(size_t size, + char *file_name, + size_t size_ctl_structure, + size_t data_seg_alignment); /** * This routine is used to attach to the shared memory segment associated with @@ -109,22 +96,11 @@ mca_common_sm_module_create_and_attach(size_t size, * @returnvalue pointer to control structure at head of shared memory segment. * Returns NULL if an error occurred. */ -OMPI_DECLSPEC extern mca_common_sm_module_t * +mca_common_sm_module_t * mca_common_sm_module_attach(opal_shmem_ds_t *seg_meta, size_t size_ctl_structure, size_t data_seg_alignment); -/** - * A thin wrapper around opal_shmem_unlink. - * - * @ modp points to an initialized mca_common_sm_module_t. - * - * @returnvalue OMPI_SUCCESS if the operation completed successfully, - * OMPI_ERROR otherwise. - */ -OMPI_DECLSPEC extern int -mca_common_sm_module_unlink(mca_common_sm_module_t *modp); - /** * This routine is used to set up a shared memory segment (whether * it's an mmaped file or a SYSV IPC segment). It is assumed that @@ -188,7 +164,7 @@ mca_common_sm_init_group(ompi_group_t *group, */ OMPI_DECLSPEC extern void * mca_common_sm_seg_alloc(struct mca_mpool_base_module_t *mpool, - size_t *size, + size_t* size, mca_mpool_base_registration_t **registration); /** @@ -213,3 +189,4 @@ OMPI_DECLSPEC extern mca_common_sm_module_t *mca_common_sm_module; END_C_DECLS #endif /* _COMMON_SM_H_ */ + diff --git a/ompi/mca/mpool/sm/mpool_sm.h b/ompi/mca/mpool/sm/mpool_sm.h index 9666b3b63f..b46bc044d5 100644 --- a/ompi/mca/mpool/sm/mpool_sm.h +++ b/ompi/mca/mpool/sm/mpool_sm.h @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2012 Los Alamos National Security, LLC. + * Copyright (c) 2010 Los Alamos National Security, LLC. * All rights reserved. * $COPYRIGHT$ * @@ -28,7 +28,6 @@ #include "ompi_config.h" #include "opal/mca/event/event.h" -#include "opal/mca/shmem/shmem.h" #include "ompi/mca/common/sm/common_sm.h" #include "ompi/mca/mpool/mpool.h" @@ -37,19 +36,17 @@ BEGIN_C_DECLS struct mca_mpool_sm_component_t { - mca_mpool_base_component_t super; - /* mca_allocator_base_module_t* sm_allocator; */ - char *sm_allocator_name; - int verbose; - /* struct mca_mpool_sm_mmap_t *sm_mmap; */ + mca_mpool_base_component_t super; + /* mca_allocator_base_module_t* sm_allocator; */ + char* sm_allocator_name; + int verbose; + /* struct mca_mpool_sm_mmap_t *sm_mmap; */ }; typedef struct mca_mpool_sm_component_t mca_mpool_sm_component_t; typedef struct mca_mpool_base_resources_t { size_t size; int32_t mem_node; - /* backing store metadata */ - opal_shmem_ds_t bs_meta_buf; } mca_mpool_base_resources_t; OMPI_MODULE_DECLSPEC extern mca_mpool_sm_component_t mca_mpool_sm_component; @@ -57,7 +54,7 @@ OMPI_MODULE_DECLSPEC extern mca_mpool_sm_component_t mca_mpool_sm_component; typedef struct mca_mpool_sm_module_t { mca_mpool_base_module_t super; long sm_size; - mca_allocator_base_module_t *sm_allocator; + mca_allocator_base_module_t * sm_allocator; struct mca_mpool_sm_mmap_t *sm_mmap; mca_common_sm_module_t *sm_common_module; int32_t mem_node; diff --git a/ompi/mca/mpool/sm/mpool_sm_component.c b/ompi/mca/mpool/sm/mpool_sm_component.c index e5f9ff3994..bccaf78e38 100644 --- a/ompi/mca/mpool/sm/mpool_sm_component.c +++ b/ompi/mca/mpool/sm/mpool_sm_component.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2012 Los Alamos National Security, LLC. + * Copyright (c) 2010 Los Alamos National Security, LLC. * All rights reserved. * $COPYRIGHT$ * @@ -45,14 +45,10 @@ /* * Local functions */ -static int -mca_mpool_sm_open(void); - -static int -mca_mpool_sm_close(void); - -static mca_mpool_base_module_t * -mca_mpool_sm_init(struct mca_mpool_base_resources_t* resources); +static int mca_mpool_sm_open(void); +static int mca_mpool_sm_close( void ); +static mca_mpool_base_module_t* mca_mpool_sm_init( + struct mca_mpool_base_resources_t* resources); mca_mpool_sm_component_t mca_mpool_sm_component = { { @@ -94,8 +90,8 @@ static int mca_mpool_sm_open(void) /* register SM component parameters */ mca_base_param_reg_string(&mca_mpool_sm_component.super.mpool_version, "allocator", - "Name of allocator component " - "to use with sm mpool", false, false, + "Name of allocator component to use with sm mpool", + false, false, "bucket", &mca_mpool_sm_component.sm_allocator_name); @@ -104,18 +100,18 @@ static int mca_mpool_sm_open(void) * to be set up to 2GB-1 for 32 bit and much greater for 64 bit. */ asprintf(&size_str, "%ld", default_min); mca_base_param_reg_string(&mca_mpool_sm_component.super.mpool_version, - "min_size", - "Minimum size of the sm mpool shared memory file", - false, false, size_str, &min_size_param); + "min_size", + "Minimum size of the sm mpool shared memory file", + false, false, size_str, &min_size_param); free(size_str); mca_base_param_reg_int(&mca_mpool_sm_component.super.mpool_version, - "verbose", - "Enable verbose output for mpool sm component", - false, false, 0, &value); + "verbose", + "Enable verbose output for mpool sm component", + false, false, 0, &value); if (value != 0) { - mca_mpool_sm_component.verbose = opal_output_open(NULL); + mca_mpool_sm_component.verbose = opal_output_open(NULL); } else { - mca_mpool_sm_component.verbose = -1; + mca_mpool_sm_component.verbose = -1; } return OMPI_SUCCESS; @@ -132,44 +128,41 @@ static int mca_mpool_sm_close( void ) return OMPI_SUCCESS; } -static mca_mpool_base_module_t * -mca_mpool_sm_init(struct mca_mpool_base_resources_t *resources) +static mca_mpool_base_module_t* mca_mpool_sm_init( + struct mca_mpool_base_resources_t* resources) { - mca_mpool_sm_module_t *mpool_module; + char *file_name; + int len; + mca_mpool_sm_module_t* mpool_module; mca_allocator_base_component_t* allocator_component; long min_size; ompi_proc_t **procs; size_t num_all_procs, i, num_local_procs = 0; /* README: this needs to change if procs in different jobs (even - * spawned ones) are to talk using shared memory */ - if (NULL == (procs = ompi_proc_world(&num_all_procs))) { - /* out of resources, so just bail */ - return NULL; - } + spawned ones) are to talk using shared memory */ + procs = ompi_proc_world(&num_all_procs); for (i = 0 ; i < num_all_procs ; ++i) { if (OPAL_PROC_ON_LOCAL_NODE(procs[i]->proc_flags)) { num_local_procs++; } } + /* parse the min size and validate it */ - /* if other parameters are added, absolutely - * necessary to reset errno each time */ + /* if other parameters are added, absolutely necessary to reset errno each time */ errno = 0; min_size = strtol(min_size_param, (char **)NULL, 10); if (errno == ERANGE) { - opal_output(0, "mca_mpool_sm_init: min_size overflows! " - "set to default (%ld)", default_min); + opal_output(0, "mca_mpool_sm_init: min_size overflows! set to default (%ld)", default_min); min_size = default_min; } else if (errno == EINVAL) { - opal_output(0, "mca_mpool_sm_init: invalid min_size entered. " - "set it to (%ld)", default_min); + opal_output(0, "mca_mpool_sm_init: invalid min_size entered. set it to (%ld)", default_min); min_size = default_min; } /* Make a new mpool module */ mpool_module = - (mca_mpool_sm_module_t *)malloc(sizeof(mca_mpool_sm_module_t)); + (mca_mpool_sm_module_t*)malloc(sizeof(mca_mpool_sm_module_t)); mca_mpool_sm_module_init(mpool_module); /* set sm_size */ @@ -180,26 +173,23 @@ mca_mpool_sm_init(struct mca_mpool_base_resources_t *resources) mpool_module->sm_size = min_size; } + /* add something for the control structure */ + mpool_module->sm_size += sizeof(mca_common_sm_module_t); + allocator_component = mca_allocator_component_lookup( mca_mpool_sm_component.sm_allocator_name); /* if specified allocator cannot be loaded - look for an alternative */ - if (NULL == allocator_component) { - if (opal_list_get_size(&mca_allocator_base_components) == 0) { - mca_base_component_list_item_t *item = - (mca_base_component_list_item_t *) + if(NULL == allocator_component) { + if(opal_list_get_size(&mca_allocator_base_components) == 0) { + mca_base_component_list_item_t* item = (mca_base_component_list_item_t*) opal_list_get_first(&mca_allocator_base_components); - allocator_component = - (mca_allocator_base_component_t *)item->cli_component; - opal_output( - 0, "mca_mpool_sm_init: " - "unable to locate allocator: %s - using %s\n", - mca_mpool_sm_component.sm_allocator_name, - allocator_component->allocator_version.mca_component_name); + allocator_component = (mca_allocator_base_component_t*)item->cli_component; + opal_output(0, "mca_mpool_sm_init: unable to locate allocator: %s - using %s\n", + mca_mpool_sm_component.sm_allocator_name, allocator_component->allocator_version.mca_component_name); } else { - opal_output(0, "mca_mpool_sm_init: " - "unable to locate allocator: %s\n", - mca_mpool_sm_component.sm_allocator_name); + opal_output(0, "mca_mpool_sm_init: unable to locate allocator: %s\n", + mca_mpool_sm_component.sm_allocator_name); free(procs); return NULL; } @@ -207,28 +197,41 @@ mca_mpool_sm_init(struct mca_mpool_base_resources_t *resources) mpool_module->mem_node = resources->mem_node; + /* create initial shared memory mapping */ + len = asprintf( &file_name, "%s"OPAL_PATH_SEP"shared_mem_pool.%s", + orte_process_info.job_session_dir, + orte_process_info.nodename ); + if ( 0 > len ) { + free(mpool_module); + free(procs); + return NULL; + } + opal_output(mca_mpool_sm_component.verbose, "mca_mpool_sm_init: shared memory size used: (%ld)", mpool_module->sm_size); - if (NULL == (mpool_module->sm_common_module = - mca_common_sm_module_attach(&resources->bs_meta_buf, + if (NULL == (mpool_module->sm_common_module = + mca_common_sm_init(procs, num_all_procs, + mpool_module->sm_size, + file_name, sizeof(mca_common_sm_module_t), 8))) { - opal_output(mca_mpool_sm_component.verbose, "mca_mpool_sm_init: " - "unable to create shared memory mapping (%s)", - resources->bs_meta_buf.seg_name); + opal_output(mca_mpool_sm_component.verbose, + "mca_mpool_sm_init: unable to create shared memory mapping (%s)", file_name); + free(file_name); free(mpool_module); free(procs); return NULL; } free(procs); + free(file_name); /* setup allocator */ mpool_module->sm_allocator = allocator_component->allocator_init(true, mca_common_sm_seg_alloc, NULL, &(mpool_module->super)); - if (NULL == mpool_module->sm_allocator) { + if(NULL == mpool_module->sm_allocator) { opal_output(0, "mca_mpool_sm_init: unable to initialize allocator"); free(mpool_module); return NULL; diff --git a/opal/mca/shmem/mmap/shmem_mmap_module.c b/opal/mca/shmem/mmap/shmem_mmap_module.c index c63d633987..23e0195a37 100644 --- a/opal/mca/shmem/mmap/shmem_mmap_module.c +++ b/opal/mca/shmem/mmap/shmem_mmap_module.c @@ -122,12 +122,13 @@ opal_shmem_mmap_module_t opal_shmem_mmap_module = { static inline void shmem_ds_reset(opal_shmem_ds_t *ds_buf) { - /* don't print ds_buf info here, as we may be printing garbage. */ OPAL_OUTPUT_VERBOSE( (70, opal_shmem_base_output, - "%s: %s: shmem_ds_resetting\n", + "%s: %s: shmem_ds_resetting " + "(id: %d, size: %lu, name: %s)\n", mca_shmem_mmap_component.super.base_version.mca_type_name, - mca_shmem_mmap_component.super.base_version.mca_component_name) + mca_shmem_mmap_component.super.base_version.mca_component_name, + ds_buf->seg_id, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) ); ds_buf->seg_cpid = 0; diff --git a/opal/mca/shmem/posix/shmem_posix_module.c b/opal/mca/shmem/posix/shmem_posix_module.c index 0ef9cfe5a3..22789cafb0 100644 --- a/opal/mca/shmem/posix/shmem_posix_module.c +++ b/opal/mca/shmem/posix/shmem_posix_module.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. - * Copyright (c) 2010-2012 Los Alamos National Security, LLC. + * Copyright (c) 2010-2011 Los Alamos National Security, LLC. * All rights reserved. * * $COPYRIGHT$ @@ -108,12 +108,13 @@ opal_shmem_posix_module_t opal_shmem_posix_module = { static inline void shmem_ds_reset(opal_shmem_ds_t *ds_buf) { - /* don't print ds_buf info here, as we may be printing garbage. */ OPAL_OUTPUT_VERBOSE( (70, opal_shmem_base_output, - "%s: %s: shmem_ds_resetting\n", + "%s: %s: shmem_ds_resetting " + "(id: %d, size: %lu, name: %s)\n", mca_shmem_posix_component.super.base_version.mca_type_name, - mca_shmem_posix_component.super.base_version.mca_component_name) + mca_shmem_posix_component.super.base_version.mca_component_name, + ds_buf->seg_id, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) ); ds_buf->seg_cpid = 0; diff --git a/opal/mca/shmem/shmem_types.h b/opal/mca/shmem/shmem_types.h index ee7f9b202d..b2e165b2be 100644 --- a/opal/mca/shmem/shmem_types.h +++ b/opal/mca/shmem/shmem_types.h @@ -12,7 +12,7 @@ * Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2010 IBM Corporation. All rights reserved. - * Copyright (c) 2010-2012 Los Alamos National Security, LLC. + * Copyright (c) 2010-2011 Los Alamos National Security, LLC. * All rights reserved. * $COPYRIGHT$ * @@ -33,13 +33,6 @@ #include "opal_config.h" -#ifdef HAVE_STDDEF_H -#include -#endif /* HAVE_STDDEF_H */ -#ifdef HAVE_STRING_H -#include -#endif /* HAVE_STRING_H */ - BEGIN_C_DECLS /* ////////////////////////////////////////////////////////////////////////// */ @@ -99,6 +92,7 @@ do { \ #define OPAL_SHMEM_DS_IS_VALID(ds_buf) \ ( (ds_buf)->flags & OPAL_SHMEM_DS_FLAGS_VALID ) +/* ////////////////////////////////////////////////////////////////////////// */ typedef uint8_t opal_shmem_ds_flag_t; /* shared memory segment header */ @@ -119,35 +113,13 @@ struct opal_shmem_ds_t { int seg_id; /* size of shared memory segment */ size_t seg_size; + /* path to backing store */ + char seg_name[OPAL_PATH_MAX]; /* base address of shared memory segment */ unsigned char *seg_base_addr; - /* path to backing store -- last element so we can easily calculate the - * "real" size of opal_shmem_ds_t. that is, the amount of the struct that - * is actually being used. for example: if seg_name is something like: - * "foo_baz" and OPAL_PATH_MAX is 4096, we want to know that only a very - * limited amount of the seg_name buffer is actually being used. - */ - char seg_name[OPAL_PATH_MAX]; }; typedef struct opal_shmem_ds_t opal_shmem_ds_t; -/* ////////////////////////////////////////////////////////////////////////// */ -/** - * Simply returns the amount of used space. For use when sending the entire - * opal_shmem_ds_t payload isn't viable -- due to the potential disparity - * between the reserved buffer space and what is actually in use. - */ -static inline size_t -opal_shmem_sizeof_shmem_ds(const opal_shmem_ds_t *ds_bufp) -{ - char *name_base = NULL; - size_t name_buf_offset = offsetof(opal_shmem_ds_t, seg_name); - - name_base = (char *)ds_bufp + name_buf_offset; - - return name_buf_offset + strlen(name_base) + 1; -} - END_C_DECLS #endif /* OPAL_SHMEM_TYPES_H */ diff --git a/opal/mca/shmem/sysv/shmem_sysv_module.c b/opal/mca/shmem/sysv/shmem_sysv_module.c index 1d8389c938..c0d13f429f 100644 --- a/opal/mca/shmem/sysv/shmem_sysv_module.c +++ b/opal/mca/shmem/sysv/shmem_sysv_module.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. - * Copyright (c) 2010-2012 Los Alamos National Security, LLC. + * Copyright (c) 2010-2011 Los Alamos National Security, LLC. * All rights reserved. * * $COPYRIGHT$ @@ -113,12 +113,13 @@ opal_shmem_sysv_module_t opal_shmem_sysv_module = { static inline void shmem_ds_reset(opal_shmem_ds_t *ds_buf) { - /* don't print ds_buf info here, as we may be printing garbage. */ OPAL_OUTPUT_VERBOSE( (70, opal_shmem_base_output, - "%s: %s: shmem_ds_resetting\n", + "%s: %s: shmem_ds_resetting " + "(id: %d, size: %lu, name: %s)\n", mca_shmem_sysv_component.super.base_version.mca_type_name, - mca_shmem_sysv_component.super.base_version.mca_component_name) + mca_shmem_sysv_component.super.base_version.mca_component_name, + ds_buf->seg_id, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) ); ds_buf->seg_cpid = 0; @@ -194,7 +195,7 @@ segment_create(opal_shmem_ds_t *ds_buf, * real_size here */ if (-1 == (ds_buf->seg_id = shmget(IPC_PRIVATE, real_size, - IPC_CREAT | IPC_EXCL | SHM_R | SHM_W))) { + IPC_CREAT | IPC_EXCL | SHM_R | SHM_W))) { int err = errno; char hn[MAXHOSTNAMELEN]; gethostname(hn, MAXHOSTNAMELEN - 1); diff --git a/opal/mca/shmem/windows/shmem_windows_module.c b/opal/mca/shmem/windows/shmem_windows_module.c index 1971627fbe..434906ff53 100644 --- a/opal/mca/shmem/windows/shmem_windows_module.c +++ b/opal/mca/shmem/windows/shmem_windows_module.c @@ -114,12 +114,13 @@ opal_shmem_windows_module_t opal_shmem_windows_module = { static inline void shmem_ds_reset(opal_shmem_ds_t *ds_buf) { - /* don't print ds_buf info here, as we may be printing garbage. */ OPAL_OUTPUT_VERBOSE( (70, opal_shmem_base_output, - "%s: %s: shmem_ds_resetting\n", + "%s: %s: shmem_ds_resetting " + "(id: %d, size: %"PRIsize_t", name: %s)\n", mca_shmem_windows_component.super.base_version.mca_type_name, - mca_shmem_windows_component.super.base_version.mca_component_name) + mca_shmem_windows_component.super.base_version.mca_component_name, + ds_buf->seg_id, ds_buf->seg_size, ds_buf->seg_name) ); ds_buf->seg_cpid = 0;