diff --git a/ompi/mca/btl/sm/btl_sm.c b/ompi/mca/btl/sm/btl_sm.c index 21c55e1e82..f07310dd57 100644 --- a/ompi/mca/btl/sm/btl_sm.c +++ b/ompi/mca/btl/sm/btl_sm.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved. * Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2012 Los Alamos National Security, LLC. + * Copyright (c) 2010-2013 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2010-2012 IBM Corporation. All rights reserved. * Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved. @@ -43,9 +43,12 @@ #include "opal/util/output.h" #include "opal/util/printf.h" #include "opal/mca/hwloc/base/base.h" +#include "opal/mca/shmem/base/base.h" +#include "opal/mca/shmem/shmem.h" #include "orte/util/proc_info.h" #include "opal/datatype/opal_convertor.h" #include "ompi/class/ompi_free_list.h" +#include "ompi/runtime/ompi_module_exchange.h" #include "ompi/mca/btl/btl.h" #include "ompi/mca/mpool/base/base.h" #include "ompi/mca/mpool/sm/mpool_sm.h" @@ -111,7 +114,6 @@ mca_btl_sm_t mca_btl_sm = { */ #define OFFSET2ADDR(OFFSET, BASE) ((ptrdiff_t)(OFFSET) + (char*)(BASE)) - static void *mpool_calloc(size_t nmemb, size_t size) { void *buf; @@ -127,17 +129,179 @@ static void *mpool_calloc(size_t nmemb, size_t size) return buf; } - -static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n) +/* + * Returns a pointer to node rank zero. Returns NULL on error. + */ +static ompi_proc_t * +get_node_rank_zero_proc_ptr(ompi_proc_t **proc_world, + size_t proc_world_size) { - size_t size, length, length_payload; - char *sm_ctl_file; + size_t num_local_procs = 0; + + if (NULL == proc_world) { + return NULL; + } + /* sort the procs list and get a pointer to the lowest node rank */ + if (OMPI_SUCCESS != mca_common_sm_local_proc_reorder(proc_world, + proc_world_size, + &num_local_procs)) { + opal_output(0, "mca_common_sm_local_proc_reorder failure! 
" + "Cannot continue.\n"); + return NULL; + } + + return proc_world[0]; +} + +/* + * Receives each member of node rank zero's sm modex under its own key and + * assembles the pieces into a newly allocated mca_btl_sm_modex_t. On success + * the caller owns *sm_modex_bufp; on failure any buffer stored there is freed + * and *sm_modex_bufp is reset to NULL. + */ +static int +do_segmented_modex_recv(mca_btl_sm_component_t *comp_ptr, + mca_btl_sm_modex_t **sm_modex_bufp) +{ + int member_id = 0, rc = OMPI_ERROR; + size_t segment_size = 0, member_offset = 0; + /* length of the key buffer; number of procs from ompi_proc_world */ + size_t key_len = 0, proc_world_size = 0; + unsigned char *modex_bufp = NULL; + char *key = NULL, *modex_comp_name = NULL; + void *tmp_bufp = NULL; + ompi_proc_t **proc_world = NULL, *proc_node_rank_zero = NULL; + + if (NULL == (modex_bufp = calloc(1, sizeof(mca_btl_sm_modex_t)))) { + rc = OMPI_ERR_OUT_OF_RESOURCE; + goto out; + } + /* stash the base of the modex buffer, because modex_bufp gets modified */ + *sm_modex_bufp = (mca_btl_sm_modex_t *)modex_bufp; + if (NULL == (proc_world = ompi_proc_world(&proc_world_size))) { + opal_output(0, "ompi_proc_world failure! Cannot continue.\n"); + rc = OMPI_ERROR; + goto out; + } + if (NULL == (proc_node_rank_zero = + get_node_rank_zero_proc_ptr(proc_world, proc_world_size))) { + opal_output(0, "get_node_rank_zero_proc_ptr failure! 
" + "Cannot continue.\n"); + rc = OMPI_ERROR; + goto out; + } + if (NULL == (modex_comp_name = + mca_base_component_to_string(&comp_ptr->super.btl_version))) { + rc = OMPI_ERR_OUT_OF_RESOURCE; + goto out; + } + /* SM_MODEX_STR_PAD to accommodate the member id and key index */ + key_len = strlen(modex_comp_name) + SM_MODEX_STR_PAD; + if (NULL == (key = calloc(key_len, sizeof(*key)))) { + rc = OMPI_ERR_OUT_OF_RESOURCE; + goto out; + } + + /* iterate over all modex members and store their respective pieces */ + for (member_id = 0; member_id < SM_MODEX_NUM_MEMBERS; ++member_id) { + if (OMPI_SUCCESS != (rc = + mca_btl_sm_get_modex_member_off_n_size(*sm_modex_bufp, member_id, + &member_offset, NULL))) { + goto out; + } + if (member_id < 2) { /* for mids 0 and 1 */ + opal_shmem_ds_t *tmp_ds = calloc(1, sizeof(*tmp_ds)); + size_t path_offset = offsetof(opal_shmem_ds_t, seg_name); + char *tmp_pathp = NULL; + if (NULL == tmp_ds) { + rc = OMPI_ERR_OUT_OF_RESOURCE; + goto out; + } + (void)snprintf(key, key_len, "%s-%d-%d", + modex_comp_name, member_id, 0); + rc = ompi_modex_recv_string((const char *)key, proc_node_rank_zero, + &tmp_bufp, &segment_size); + if (OMPI_SUCCESS != rc) { + /* rc is set; never copy from tmp_bufp after a failed receive */ + free(tmp_ds); + goto out; + } + (void)memmove(tmp_ds, tmp_bufp, segment_size); + free(tmp_bufp); + /* now copy the path stuff */ + (void)snprintf(key, key_len, "%s-%d-%d", + modex_comp_name, member_id, 1); + rc = ompi_modex_recv_key_value((const char *)key, proc_node_rank_zero, + &tmp_bufp, OPAL_STRING); + if (OMPI_SUCCESS != rc) { + /* rc is set; tmp_bufp does not hold a path string to measure */ + free(tmp_ds); + goto out; + } + tmp_pathp = (char *)tmp_bufp; + (void)memmove((unsigned char *)tmp_ds + path_offset, + tmp_pathp, strlen(tmp_pathp) + 1); + modex_bufp = ((unsigned char *)*sm_modex_bufp) + member_offset; + (void)memmove(modex_bufp, tmp_ds, sizeof(*tmp_ds)); + free(tmp_ds); + free(tmp_bufp); + } + else { /* for mid 2 */ + 
(void)snprintf(key, key_len, "%s-%d", modex_comp_name, member_id); + rc = ompi_modex_recv_string((const char *)key, proc_node_rank_zero, + &tmp_bufp, &segment_size); + if (OMPI_SUCCESS != rc) { + /* rc is set */ + goto out; + } + modex_bufp = ((unsigned 
char *)*sm_modex_bufp) + member_offset; + (void)memmove(modex_bufp, tmp_bufp, segment_size); + free(tmp_bufp); + } + } + +out: + if (NULL != modex_comp_name) { + free(modex_comp_name); + } + if (NULL != key) { + free(key); + } + if (NULL != proc_world) { + free(proc_world); + } + if (OMPI_SUCCESS != rc && NULL != *sm_modex_bufp) { + free(*sm_modex_bufp); + *sm_modex_bufp = NULL; + } + return rc; +} +/* + * Modex receive. Caller is responsible for freeing returned resources. + */ +static inline int +recv_modex(mca_btl_sm_component_t *comp_ptr, + mca_btl_sm_modex_t **out_modex) +{ + int rc; + + if (OMPI_SUCCESS != (rc = do_segmented_modex_recv(comp_ptr, out_modex))) { + opal_output(0, "recv_modex: do_segmented_modex_recv failure!\n"); + } + return rc; +} + +static int +sm_btl_first_time_init(mca_btl_sm_t *sm_btl, + int32_t my_smp_rank, + int n) +{ + size_t length, length_payload; sm_fifo_t *my_fifos; - int my_mem_node, num_mem_nodes, i; - ompi_proc_t **procs; - size_t num_procs; - mca_mpool_base_resources_t res; + int my_mem_node, num_mem_nodes, i, rc; + mca_mpool_base_resources_t *res = NULL; mca_btl_sm_component_t* m = &mca_btl_sm_component; + mca_btl_sm_modex_t *modex = NULL; /* Assume we don't have hwloc support and fill in dummy info */ mca_btl_sm_component.mem_node = my_mem_node = 0; @@ -190,50 +338,43 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n) } #endif - /* lookup shared memory pool */ - mca_btl_sm_component.sm_mpools = (mca_mpool_base_module_t **) calloc(num_mem_nodes, - sizeof(mca_mpool_base_module_t*)); - - /* Disable memory binding, because each MPI process will claim - pages in the mpool for their local NUMA node */ - res.mem_node = -1; - - /* determine how much memory to create */ - /* - * This heuristic formula mostly says that we request memory for: - * - nfifos FIFOs, each comprising: - * . a sm_fifo_t structure - * . 
many pointers (fifo_size of them per FIFO) - * - eager fragments (2*n of them, allocated in sm_free_list_inc chunks) - * - max fragments (sm_free_list_num of them) - * - * On top of all that, we sprinkle in some number of - * "opal_cache_line_size" additions to account for some - * padding and edge effects that may lie in the allocator. - */ - res.size = - FIFO_MAP_NUM(n) * ( sizeof(sm_fifo_t) + sizeof(void *) * m->fifo_size + 4 * opal_cache_line_size ) - + ( 2 * n + m->sm_free_list_inc ) * ( m->eager_limit + 2 * opal_cache_line_size ) - + m->sm_free_list_num * ( m->max_frag_size + 2 * opal_cache_line_size ); - - /* before we multiply by n, make sure the result won't overflow */ - /* Stick that little pad in, particularly since we'll eventually - * need a little extra space. E.g., in mca_mpool_sm_init() in - * mpool_sm_component.c when sizeof(mca_common_sm_module_t) is - * added. - */ - if ( ((double) res.size) * n > LONG_MAX - 4096 ) { + if (NULL == (res = calloc(1, sizeof(*res)))) { return OMPI_ERR_OUT_OF_RESOURCE; } - res.size *= n; - - /* now, create it */ + /* everyone receive modex information. all but node rank zero attach to the + * segments stored within the modex. remember: node rank zero is already + * attached to sm_seg. 
*/ + if (OMPI_SUCCESS != (rc = recv_modex(m, &modex))) { + free(res); + return rc; + } + /* lookup shared memory pool */ + mca_btl_sm_component.sm_mpools = + (mca_mpool_base_module_t **)calloc(num_mem_nodes, + sizeof(mca_mpool_base_module_t *)); + + /* Disable memory binding, because each MPI process will claim pages in the + * mpool for their local NUMA node */ + res->mem_node = -1; + res->size = modex->mpool_res_size; + + /* copy mpool's modex info into its base resources */ + if (OPAL_SUCCESS != + opal_shmem_ds_copy(&(modex->sm_mpool_meta_buf), + &(res->bs_meta_buf))) { + free(res); + free(modex); + return OMPI_ERROR; + } + /* now that res is fully populated, create the thing */ mca_btl_sm_component.sm_mpools[0] = mca_mpool_base_module_create(mca_btl_sm_component.sm_mpool_name, - sm_btl, &res); + sm_btl, res); /* Sanity check to ensure that we found it */ if (NULL == mca_btl_sm_component.sm_mpools[0]) { - return OMPI_ERR_OUT_OF_RESOURCE; + free(res); + free(modex); + return OMPI_ERR_OUT_OF_RESOURCE; } mca_btl_sm_component.sm_mpool = mca_btl_sm_component.sm_mpools[0]; @@ -245,37 +386,27 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n) mca_btl_sm_component.sm_peers = (struct mca_btl_base_endpoint_t**) calloc(n, sizeof(struct mca_btl_base_endpoint_t*)); if (NULL == mca_btl_sm_component.sm_peers) { + free(res); + free(modex); return OMPI_ERR_OUT_OF_RESOURCE; } - - /* Allocate Shared Memory BTL process coordination - * data structure. This will reside in shared memory */ - - /* set file name */ - if (asprintf(&sm_ctl_file, "%s"OPAL_PATH_SEP"shared_mem_btl_module.%s", - orte_process_info.job_session_dir, - orte_process_info.nodename) < 0) { - return OMPI_ERR_OUT_OF_RESOURCE; + if (0 != my_smp_rank) { + if (NULL == (mca_btl_sm_component.sm_seg = + mca_common_sm_module_attach(&modex->sm_meta_buf, + sizeof(mca_common_sm_seg_header_t), + opal_cache_line_size))) { + /* don't have to detach here, because module_attach cleans up after + * itself on failure. 
*/ + opal_output(0, "sm_btl_first_time_init: " + "mca_common_sm_module_attach failure!\n"); + free(modex); + free(res); + return OMPI_ERROR; + } } - - /* Pass in a data segment alignment of 0 to get no data - segment (only the shared control structure) */ - size = sizeof(mca_common_sm_seg_header_t) + - n * (sizeof(sm_fifo_t*) + sizeof(char *) + sizeof(uint16_t)) + opal_cache_line_size; - procs = ompi_proc_world(&num_procs); - if (!(mca_btl_sm_component.sm_seg = - mca_common_sm_init(procs, num_procs, size, sm_ctl_file, - sizeof(mca_common_sm_seg_header_t), - opal_cache_line_size))) { - opal_output(0, "mca_btl_sm_add_procs: unable to create shared memory " - "BTL coordinating strucure :: size %lu \n", - (unsigned long)size); - free(procs); - free(sm_ctl_file); - return OMPI_ERROR; - } - free(procs); - free(sm_ctl_file); + /* it is now safe to free the modex and the mpool resources */ + free(modex); + free(res); /* check to make sure number of local procs is within the * specified limits */ @@ -374,6 +505,7 @@ static struct mca_btl_base_endpoint_t * create_sm_endpoint(int local_proc, struct ompi_proc_t *proc) { struct mca_btl_base_endpoint_t *ep; + #if OMPI_ENABLE_PROGRESS_THREADS == 1 char path[PATH_MAX]; #endif @@ -401,22 +533,6 @@ create_sm_endpoint(int local_proc, struct ompi_proc_t *proc) return ep; } -static void calc_sm_max_procs(int n) -{ - /* see if need to allocate space for extra procs */ - if(0 > mca_btl_sm_component.sm_max_procs) { - /* no limit */ - if(0 <= mca_btl_sm_component.sm_extra_procs) { - /* limit */ - mca_btl_sm_component.sm_max_procs = - n + mca_btl_sm_component.sm_extra_procs; - } else { - /* no limit */ - mca_btl_sm_component.sm_max_procs = 2 * n; - } - } -} - int mca_btl_sm_add_procs( struct mca_btl_base_module_t* btl, size_t nprocs, @@ -430,6 +546,9 @@ int mca_btl_sm_add_procs( mca_btl_sm_t *sm_btl; bool have_connected_peer = false; char **bases; + /* for easy access to the mpool_sm_module */ + mca_mpool_sm_module_t *sm_mpool_modp = NULL; 
+ /* initializion */ sm_btl = (mca_btl_sm_t *)btl; @@ -442,7 +561,7 @@ int mca_btl_sm_add_procs( * and idetify procs that are on this host. Add procs on this * host to shared memory reachbility list. Also, get number * of local procs in the procs list. */ - for(proc = 0; proc < (int32_t)nprocs; proc++) { + for (proc = 0; proc < (int32_t)nprocs; proc++) { /* check to see if this proc can be reached via shmem (i.e., if they're on my local host and in my job) */ if (procs[proc]->proc_name.jobid != my_proc->proc_name.jobid || @@ -477,18 +596,18 @@ int mca_btl_sm_add_procs( goto CLEANUP; /* make sure that my_smp_rank has been defined */ - if(-1 == my_smp_rank) { + if (-1 == my_smp_rank) { return_code = OMPI_ERROR; goto CLEANUP; } - calc_sm_max_procs(n_local_procs); - if (!sm_btl->btl_inited) { return_code = - sm_btl_first_time_init(sm_btl, mca_btl_sm_component.sm_max_procs); - if(return_code != OMPI_SUCCESS) + sm_btl_first_time_init(sm_btl, my_smp_rank, + mca_btl_sm_component.sm_max_procs); + if (return_code != OMPI_SUCCESS) { goto CLEANUP; + } } /* set local proc's smp rank in the peers structure for @@ -501,6 +620,7 @@ int mca_btl_sm_add_procs( } bases = mca_btl_sm_component.shm_bases; + sm_mpool_modp = (mca_mpool_sm_module_t *)mca_btl_sm_component.sm_mpool; /* initialize own FIFOs */ /* @@ -524,13 +644,34 @@ int mca_btl_sm_add_procs( /* Sync with other local procs. Force the FIFO initialization to always * happens before the readers access it. */ - opal_atomic_add_32( &mca_btl_sm_component.sm_seg->module_seg->seg_inited, 1); + opal_atomic_add_32(&mca_btl_sm_component.sm_seg->module_seg->seg_inited, 1); while( n_local_procs > mca_btl_sm_component.sm_seg->module_seg->seg_inited) { opal_progress(); opal_atomic_rmb(); } + /* it is now safe to unlink the shared memory segment. only one process + * needs to do this, so just let smp rank zero take care of it. 
*/ + if (0 == my_smp_rank) { + if (OMPI_SUCCESS != + mca_common_sm_module_unlink(mca_btl_sm_component.sm_seg)) { + /* it is "okay" if this fails at this point. we have gone this far, + * so just warn about the failure and continue. this is probably + * only triggered by a programming error. */ + opal_output(0, "WARNING: common_sm_module_unlink failed.\n"); + } + /* SKG - another abstraction violation here, but I don't want to add + * extra code in the sm mpool for further synchronization. */ + + /* at this point, all processes have attached to the mpool segment. so + * it is safe to unlink it here. */ + if (OMPI_SUCCESS != + mca_common_sm_module_unlink(sm_mpool_modp->sm_common_module)) { + opal_output(0, "WARNING: common_sm_module_unlink failed.\n"); + } + } + /* coordinate with other processes */ for(j = mca_btl_sm_component.num_smp_procs; j < mca_btl_sm_component.num_smp_procs + n_local_procs; j++) { diff --git a/ompi/mca/btl/sm/btl_sm.h b/ompi/mca/btl/sm/btl_sm.h index 2a02f543b4..07be4cd53c 100644 --- a/ompi/mca/btl/sm/btl_sm.h +++ b/ompi/mca/btl/sm/btl_sm.h @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved. * Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010 Los Alamos National Security, LLC. + * Copyright (c) 2010-2013 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2010-2012 IBM Corporation. All rights reserved. * $COPYRIGHT$ @@ -42,6 +42,8 @@ #include "opal/util/bit_ops.h" #include "opal/class/opal_free_list.h" +#include "opal/mca/shmem/shmem.h" + #include "ompi/mca/btl/btl.h" #include "ompi/mca/common/sm/common_sm.h" @@ -83,6 +85,10 @@ BEGIN_C_DECLS line that should hopefully be good in most places. */ #define SM_CACHE_LINE_PAD 128 +/* number of members in mca_btl_sm_modex_t */ +#define SM_MODEX_NUM_MEMBERS 3 +#define SM_MODEX_STR_PAD 32 + struct sm_fifo_t { /* This queue pointer is used only by the heads. 
*/ volatile void **queue; @@ -121,6 +127,58 @@ typedef struct mca_btl_sm_mem_node_t { mca_mpool_base_module_t* sm_mpool; /**< shared memory pool */ } mca_btl_sm_mem_node_t; +/** + * Shared Memory (SM) BTL modex. + * Please update SM_MODEX_NUM_MEMBERS if the number of members ever changes. + */ +struct mca_btl_sm_modex_t { + /* 0 */ + opal_shmem_ds_t sm_meta_buf; + /* 1 */ + opal_shmem_ds_t sm_mpool_meta_buf; + /* 2 */ + size_t mpool_res_size; +}; +typedef struct mca_btl_sm_modex_t mca_btl_sm_modex_t; + +static inline int +mca_btl_sm_get_modex_member_off_n_size(const mca_btl_sm_modex_t *bp, + int mid, size_t *out_off, + size_t *out_size) { + switch (mid) { + /* sm_meta_buf */ + case 0: + if (NULL != out_off) { + *out_off = offsetof(mca_btl_sm_modex_t, sm_meta_buf); + } + if (NULL != out_size) { + *out_size = opal_shmem_sizeof_shmem_ds(&bp->sm_meta_buf); + } + break; + /* sm_mpool_meta_buf */ + case 1: + if (NULL != out_off) { + *out_off = offsetof(mca_btl_sm_modex_t, sm_mpool_meta_buf); + } + if (NULL != out_size) { + *out_size = opal_shmem_sizeof_shmem_ds(&bp->sm_mpool_meta_buf); + } + break; + case 2: + /* mpool_res_size */ + if (NULL != out_off) { + *out_off = offsetof(mca_btl_sm_modex_t, mpool_res_size); + } + if (NULL != out_size) { + *out_size = sizeof(bp->mpool_res_size); + } + break; + default: + return OMPI_ERR_VALUE_OUT_OF_BOUNDS; + } + return OMPI_SUCCESS; +} + /** * Shared Memory (SM) BTL module. */ diff --git a/ompi/mca/btl/sm/btl_sm_component.c b/ompi/mca/btl/sm/btl_sm_component.c index 9b66830c5a..20d06cedb7 100644 --- a/ompi/mca/btl/sm/btl_sm_component.c +++ b/ompi/mca/btl/sm/btl_sm_component.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved. * Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2011 Los Alamos National Security, LLC. + * Copyright (c) 2010-2013 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2011 NVIDIA Corporation. 
All rights reserved. * Copyright (c) 2010-2012 IBM Corporation. All rights reserved. @@ -42,14 +42,18 @@ #include /* for mkfifo */ #endif /* HAVE_SYS_STAT_H */ -#include "ompi/constants.h" +#include "opal/mca/base/mca_base_param.h" +#include "opal/mca/shmem/base/base.h" +#include "opal/mca/shmem/shmem.h" #include "opal/util/bit_ops.h" #include "opal/util/output.h" -#include "orte/util/proc_info.h" + #include "orte/util/show_help.h" #include "orte/runtime/orte_globals.h" +#include "orte/util/proc_info.h" -#include "opal/mca/base/mca_base_param.h" +#include "ompi/constants.h" +#include "ompi/runtime/ompi_module_exchange.h" #include "ompi/mca/mpool/base/base.h" #include "ompi/mca/common/sm/common_sm.h" #include "ompi/mca/btl/base/btl_base_error.h" @@ -356,52 +360,450 @@ CLEANUP: return return_value; } +/* + * Returns the number of processes on the node. + */ +static inline int +get_num_local_procs(void) +{ + /* num_local_peers does not include us in + * its calculation, so adjust for that */ + return (int)(1 + orte_process_info.num_local_peers); +} + +static void +calc_sm_max_procs(int n) +{ + /* see if need to allocate space for extra procs */ + if (0 > mca_btl_sm_component.sm_max_procs) { + /* no limit */ + if (0 <= mca_btl_sm_component.sm_extra_procs) { + /* limit */ + mca_btl_sm_component.sm_max_procs = + n + mca_btl_sm_component.sm_extra_procs; + } else { + /* no limit */ + mca_btl_sm_component.sm_max_procs = 2 * n; + } + } +} + +static int +create_and_attach(mca_btl_sm_component_t *comp_ptr, + size_t size, + char *file_name, + size_t size_ctl_structure, + size_t data_seg_alignment, + mca_common_sm_module_t **out_modp) + +{ + if (NULL == (*out_modp = + mca_common_sm_module_create_and_attach(size, file_name, + size_ctl_structure, + data_seg_alignment))) { + opal_output(0, "create_and_attach: unable to create shared memory " + "BTL coordinating strucure :: size %lu \n", + (unsigned long)size); + return OMPI_ERROR; + } + return OMPI_SUCCESS; +} + +/* + * SKG - I'm 
not happy with this, but I can't figure out a better way of + * finding the sm mpool's minimum size 8-|. The way I see it, this BTL only + * uses the sm mpool, so maybe this isn't so bad... + * + * The problem is that we need to size the mpool resources at sm BTL component + * init. That means we need to know the mpool's minimum size at create. + */ +static int +get_min_mpool_size(mca_btl_sm_component_t *comp_ptr, + size_t *out_size) +{ + char *type_name = "mpool"; + char *param_name = "min_size"; + char *min_size = NULL; + int id = 0; + size_t default_min = 67108864; + size_t size = 0; + long tmp_size = 0; + + if (0 > (id = mca_base_param_find(type_name, comp_ptr->sm_mpool_name, + param_name))) { + opal_output(0, "mca_base_param_find: failure looking for %s_%s_%s\n", + type_name, comp_ptr->sm_mpool_name, param_name); + return OMPI_ERR_NOT_FOUND; + } + if (OPAL_SUCCESS != mca_base_param_lookup_string(id, &min_size)) { + opal_output(0, "mca_base_param_lookup_string failure\n"); + return OMPI_ERROR; + } + errno = 0; /* a NULL value is treated as 0 so the default below is used */ + tmp_size = (NULL == min_size) ? 0 : strtol(min_size, (char **)NULL, 10); + if (ERANGE == errno || EINVAL == errno || tmp_size <= 0) { + opal_output(0, "mca_btl_sm::get_min_mpool_size: " + "Unusable %s_%s_min_size provided. " + "Continuing with %lu.", type_name, + comp_ptr->sm_mpool_name, + (unsigned long)default_min); + + size = default_min; + } + else { + size = (size_t)tmp_size; + } + free(min_size); + *out_size = size; + return OMPI_SUCCESS; +} + +static int +get_mpool_res_size(int32_t max_procs, + size_t *out_res_size) +{ + size_t size = 0; + /* determine how much memory to create */ + /* + * This heuristic formula mostly says that we request memory for: + * - nfifos FIFOs, each comprising: + * . a sm_fifo_t structure + * . 
many pointers (fifo_size of them per FIFO) + * - eager fragments (2*n of them, allocated in sm_free_list_inc chunks) + * - max fragments (sm_free_list_num of them) + * + * On top of all that, we sprinkle in some number of + * "opal_cache_line_size" additions to account for some + * padding and edge effects that may lie in the allocator. + */ + size = FIFO_MAP_NUM(max_procs) * + (sizeof(sm_fifo_t) + sizeof(void *) * + mca_btl_sm_component.fifo_size + 4 * opal_cache_line_size) + + (2 * max_procs + mca_btl_sm_component.sm_free_list_inc) * + (mca_btl_sm_component.eager_limit + 2 * opal_cache_line_size) + + mca_btl_sm_component.sm_free_list_num * + (mca_btl_sm_component.max_frag_size + 2 * opal_cache_line_size); + + /* add something for the control structure */ + size += sizeof(mca_common_sm_module_t); + + /* before we multiply by max_procs, make sure the result won't overflow */ + /* Stick that little pad in, particularly since we'll eventually + * need a little extra space. E.g., in mca_mpool_sm_init() in + * mpool_sm_component.c when sizeof(mca_common_sm_module_t) is + * added. + */ + if (((double)size) * max_procs > LONG_MAX - 4096) { + return OMPI_ERR_VALUE_OUT_OF_BOUNDS; + } + size *= (size_t)max_procs; + *out_res_size = size; + return OMPI_SUCCESS; +} + +/* + * Creates the shared-memory segments required for this BTL. One for the sm + * mpool and another for the shared memory store and populates *modex_buf_ptr. + * + * it is assumed that calc_sm_max_procs has already been called (sets + * sm_max_procs). + */ +static int +populate_modex_bufp(mca_btl_sm_component_t *comp_ptr, + mca_btl_sm_modex_t *modex_buf_ptr) +{ + int rc = OMPI_SUCCESS; + size_t size = 0; + size_t min_size = 0; + char *sm_mpool_ctl_file = NULL; + char *sm_ctl_file = NULL; + /* used as a temporary store so we can extract shmem_ds info */ + mca_common_sm_module_t *tmp_modp = NULL; + + /* first generate some unique paths for the shared-memory segments that + * this BTL needs. 
*/ + if (asprintf(&sm_mpool_ctl_file, + "%s"OPAL_PATH_SEP"shared_mem_pool.%s", + orte_process_info.job_session_dir, + orte_process_info.nodename) < 0) { + rc = OMPI_ERR_OUT_OF_RESOURCE; + goto out; + } + if (asprintf(&sm_ctl_file, + "%s"OPAL_PATH_SEP"shared_mem_btl_module.%s", + orte_process_info.job_session_dir, + orte_process_info.nodename) < 0) { + rc = OMPI_ERR_OUT_OF_RESOURCE; + goto out; + } + + /* create the shared-memory segments */ + + /* === sm mpool === */ + /* get the segment size for the sm mpool. */ + if (OMPI_SUCCESS != (rc = get_mpool_res_size(comp_ptr->sm_max_procs, + &size))) { + /* rc is already set */ + goto out; + } + /* do we need to update the size based on the sm mpool's min size? */ + if (OMPI_SUCCESS != (rc = get_min_mpool_size(comp_ptr, &min_size))) { + goto out; + } + if (size < min_size) { + size = min_size; + } + /* we only need the shmem_ds info at this point. initialization will be + * completed in the mpool module code. the idea is that we just need this + * info so we can populate the modex. */ + if (OMPI_SUCCESS != (rc = + create_and_attach(comp_ptr, size, sm_mpool_ctl_file, + sizeof(mca_common_sm_module_t), 8, &tmp_modp))) { + /* rc is set */ + goto out; + } + /* now extract and store the shmem_ds info from the returned module */ + if (OPAL_SUCCESS != + opal_shmem_ds_copy(&(tmp_modp->shmem_ds), + &(modex_buf_ptr->sm_mpool_meta_buf))) { + rc = OMPI_ERROR; + goto out; + } + /* set the mpool_res_size in the modex */ + modex_buf_ptr->mpool_res_size = size; + + /* === sm btl === */ + /* calculate the segment size. 
*/ + size = sizeof(mca_common_sm_seg_header_t) + + comp_ptr->sm_max_procs * + (sizeof(sm_fifo_t *) + + sizeof(char *) + sizeof(uint16_t)) + + opal_cache_line_size; + + if (OMPI_SUCCESS != (rc = + create_and_attach(comp_ptr, size, sm_ctl_file, + sizeof(mca_common_sm_seg_header_t), + opal_cache_line_size, &comp_ptr->sm_seg))) { + /* rc is set */ + goto out; + } + /* now extract and store the shmem_ds info from the returned module */ + if (OPAL_SUCCESS != opal_shmem_ds_copy(&(comp_ptr->sm_seg->shmem_ds), + &(modex_buf_ptr->sm_meta_buf))) { + rc = OMPI_ERROR; + goto out; + } + +out: + if (NULL != sm_mpool_ctl_file) { + free(sm_mpool_ctl_file); + } + if (NULL != sm_ctl_file) { + free(sm_ctl_file); + } + return rc; +} + +static int +send_member(char *key_prefix, + unsigned char *member_basep, + size_t extent, + int member_id) +{ + char *key = NULL; + int rc = OMPI_ERROR; + size_t shmem_path_offset = 0; + + switch (member_id) { + case 0: + case 1: + shmem_path_offset = offsetof(opal_shmem_ds_t, seg_name); + if (-1 == asprintf(&key, "%s-%d", key_prefix, 0)) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + if (OMPI_SUCCESS != (rc = + ompi_modex_send_string((const char *)key, + member_basep, shmem_path_offset))) { + free(key); + return rc; + } + free(key); + if (-1 == asprintf(&key, "%s-%d", key_prefix, 1)) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + /* using ompi_modex_send_key_value here, so the data isn't encoded + * if using PMI grpcomm. 
*/ + if (OMPI_SUCCESS != (rc = + ompi_modex_send_key_value(key, + (member_basep + shmem_path_offset), + OPAL_STRING))) { + free(key); + return rc; + } + free(key); + return OMPI_SUCCESS; + case 2: + if (OMPI_SUCCESS != (rc = + ompi_modex_send_string((const char *)key_prefix, + member_basep, extent))) { + free(key); /* no-op: key is still NULL in this case */ + return rc; + } + return OMPI_SUCCESS; + default: + return OMPI_ERR_VALUE_OUT_OF_BOUNDS; + } + return OMPI_ERROR; +} + +static int +send_all_modex_members(mca_btl_sm_component_t *comp_ptr, + mca_btl_sm_modex_t *modex_bufp) +{ + size_t offset = 0, extent = 0; + unsigned char *datap = (unsigned char *)modex_bufp; + unsigned char *tmp_base = NULL; + char *modex_comp_name = NULL; + int rc, mid; + char *key; + + if (NULL == (modex_comp_name = + mca_base_component_to_string(&comp_ptr->super.btl_version))) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + /* iterate over all the modex members and send each one under its own + * "<component name>-<member id>" key */ + for (mid = 0; mid < SM_MODEX_NUM_MEMBERS; ++mid) { + if (OMPI_SUCCESS != (rc = + mca_btl_sm_get_modex_member_off_n_size(modex_bufp, mid, + &offset, &extent))) { + /* rc is set */ + goto out; + } + tmp_base = (unsigned char *)datap + offset; + if (-1 == asprintf(&key, "%s-%d", modex_comp_name, mid)) { + rc = OMPI_ERR_OUT_OF_RESOURCE; + goto out; + } + if (OMPI_SUCCESS != (rc = send_member(key, tmp_base, extent, mid))) { + free(key); + goto out; + } + free(key); + } + +out: + if (NULL != modex_comp_name) { + free(modex_comp_name); + } + return rc; +} + +/* + * Creates the information required for the sm modex and sends it. + */ +static int +send_modex(mca_btl_sm_component_t *comp_ptr, + orte_node_rank_t node_rank) +{ + int rc = OMPI_SUCCESS; + mca_btl_sm_modex_t *sm_modex = NULL; + + /* only node rank zero needs to send modex info */ + if (0 != node_rank) { + return OMPI_SUCCESS; + } + if (NULL == (sm_modex = calloc(1, sizeof(*sm_modex)))) { + /* out of resources, so just bail. 
*/ + return OMPI_ERR_OUT_OF_RESOURCE; + } + if (OMPI_SUCCESS != (rc = populate_modex_bufp(comp_ptr, sm_modex))) { + opal_output(0, "send_modex: populate_modex_bufp failure!\n"); + /* rc is set */ + goto out; + } + rc = send_all_modex_members(comp_ptr, sm_modex); + +out: + if (NULL != sm_modex) { + free(sm_modex); + } + return rc; +} + /* * SM component initialization */ -static mca_btl_base_module_t** mca_btl_sm_component_init( - int *num_btls, - bool enable_progress_threads, - bool enable_mpi_threads) +static mca_btl_base_module_t ** +mca_btl_sm_component_init(int *num_btls, + bool enable_progress_threads, + bool enable_mpi_threads) { + int num_local_procs = 0; mca_btl_base_module_t **btls = NULL; + orte_node_rank_t my_node_rank = ORTE_NODE_RANK_INVALID; #if OMPI_BTL_SM_HAVE_KNEM int rc; #endif *num_btls = 0; - - /* if no session directory was created, then we cannot be used */ - if (!orte_create_session_dirs) { - return NULL; - } - /* lookup/create shared memory pool only when used */ mca_btl_sm_component.sm_mpool = NULL; mca_btl_sm_component.sm_mpool_base = NULL; -#if OMPI_ENABLE_PROGRESS_THREADS == 1 - /* create a named pipe to receive events */ - sprintf( mca_btl_sm_component.sm_fifo_path, - "%s"OPAL_PATH_SEP"sm_fifo.%lu", orte_process_info.job_session_dir, - (unsigned long)ORTE_PROC_MY_NAME->vpid ); - if(mkfifo(mca_btl_sm_component.sm_fifo_path, 0660) < 0) { - opal_output(0, "mca_btl_sm_component_init: mkfifo failed with errno=%d\n",errno); + /* if no session directory was created, then we cannot be used */ + /* SKG - this isn't true anymore. Some backing facilities don't require a + * file-backed store. Extend shmem to provide this info one day. 
*/ + if (!orte_create_session_dirs) { return NULL; } - mca_btl_sm_component.sm_fifo_fd = open(mca_btl_sm_component.sm_fifo_path, O_RDWR); + /* if we don't have locality information, then we cannot be used */ + if (ORTE_NODE_RANK_INVALID == + (my_node_rank = orte_process_info.my_node_rank)) { + orte_show_help("help-mpi-btl-sm.txt", "no locality", true); + return NULL; + } + /* no use trying to use sm with less than two procs, so just bail. */ + if ((num_local_procs = get_num_local_procs()) < 2) { + return NULL; + } + /* calculate max procs so we can figure out how large to make the + * shared-memory segment. this routine sets component sm_max_procs. */ + calc_sm_max_procs(num_local_procs); + + if (OMPI_SUCCESS != send_modex(&mca_btl_sm_component, my_node_rank)) { + return NULL; + } + +#if OMPI_ENABLE_PROGRESS_THREADS == 1 + /* create a named pipe to receive events */ + sprintf(mca_btl_sm_component.sm_fifo_path, + "%s"OPAL_PATH_SEP"sm_fifo.%lu", + orte_process_info.job_session_dir, + (unsigned long)ORTE_PROC_MY_NAME->vpid); + if (mkfifo(mca_btl_sm_component.sm_fifo_path, 0660) < 0) { + opal_output(0, "mca_btl_sm_component_init: " + "mkfifo failed with errno=%d\n",errno); + return NULL; + } + mca_btl_sm_component.sm_fifo_fd = open(mca_btl_sm_component.sm_fifo_path, + O_RDWR); if(mca_btl_sm_component.sm_fifo_fd < 0) { - opal_output(0, "mca_btl_sm_component_init: open(%s) failed with errno=%d\n", + opal_output(0, "mca_btl_sm_component_init: " + "open(%s) failed with errno=%d\n", mca_btl_sm_component.sm_fifo_path, errno); return NULL; } OBJ_CONSTRUCT(&mca_btl_sm_component.sm_fifo_thread, opal_thread_t); - mca_btl_sm_component.sm_fifo_thread.t_run = (opal_thread_fn_t) mca_btl_sm_component_event_thread; + mca_btl_sm_component.sm_fifo_thread.t_run = + (opal_thread_fn_t)mca_btl_sm_component_event_thread; opal_thread_start(&mca_btl_sm_component.sm_fifo_thread); #endif - mca_btl_sm_component.sm_btls = (mca_btl_sm_t **) malloc( mca_btl_sm_component.sm_max_btls * sizeof 
(mca_btl_sm_t *)); + mca_btl_sm_component.sm_btls = + (mca_btl_sm_t **)malloc(mca_btl_sm_component.sm_max_btls * + sizeof(mca_btl_sm_t *)); if (NULL == mca_btl_sm_component.sm_btls) { return NULL; } diff --git a/ompi/mca/btl/sm/help-mpi-btl-sm.txt b/ompi/mca/btl/sm/help-mpi-btl-sm.txt index b6905097d6..9d868e8445 100644 --- a/ompi/mca/btl/sm/help-mpi-btl-sm.txt +++ b/ompi/mca/btl/sm/help-mpi-btl-sm.txt @@ -4,6 +4,8 @@ # of Tennessee Research Foundation. All rights # reserved. # Copyright (c) 2006-2010 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2012 Los Alamos National Security, LLC. +# All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -12,6 +14,10 @@ # # This is the US/English help file for Open MPI's shared memory support. # +[no locality] +WARNING: Missing locality information required for sm initialization. +Continuing without shared memory support. +# [knem requested but not supported] WARNING: Linux kernel knem support was requested for the shared memory (sm) BTL, but it is not supported. 
Deactivating the shared memory diff --git a/ompi/mca/common/sm/common_sm.c b/ompi/mca/common/sm/common_sm.c index 889d9c3faa..74dfdead43 100644 --- a/ompi/mca/common/sm/common_sm.c +++ b/ompi/mca/common/sm/common_sm.c @@ -42,6 +42,7 @@ #include "opal/align.h" #include "opal/util/argv.h" +#include "opal/mca/shmem/shmem.h" #if OPAL_ENABLE_FT_CR == 1 #include "opal/runtime/opal_cr.h" #endif @@ -133,7 +134,7 @@ attach_and_init(opal_shmem_ds_t *shmem_bufp, map->module_data_addr = addr; map->module_seg_addr = (unsigned char *)seg; - + /* note that size is only used during the first call */ if (first_call) { /* initialize some segment information */ @@ -157,20 +158,20 @@ attach_and_init(opal_shmem_ds_t *shmem_bufp, } /* ////////////////////////////////////////////////////////////////////////// */ -/* api implementation */ +/* api implementation */ /* ////////////////////////////////////////////////////////////////////////// */ /* ////////////////////////////////////////////////////////////////////////// */ mca_common_sm_module_t * -mca_common_sm_module_create(size_t size, - char *file_name, - size_t size_ctl_structure, - size_t data_seg_alignment) +mca_common_sm_module_create_and_attach(size_t size, + char *file_name, + size_t size_ctl_structure, + size_t data_seg_alignment) { mca_common_sm_module_t *map = NULL; opal_shmem_ds_t *seg_meta = NULL; - if (NULL == (seg_meta = (opal_shmem_ds_t *) malloc(sizeof(*seg_meta)))) { + if (NULL == (seg_meta = (opal_shmem_ds_t *)malloc(sizeof(*seg_meta)))) { /* out of resources */ return NULL; } @@ -197,33 +198,39 @@ mca_common_sm_module_attach(opal_shmem_ds_t *seg_meta, size_t size_ctl_structure, size_t data_seg_alignment) { - mca_common_sm_module_t *map = NULL; - /* notice that size is 0 here. it really doesn't matter because size WILL * NOT be used because this is an attach (first_call is false). 
*/ - map = attach_and_init(seg_meta, 0, size_ctl_structure, - data_seg_alignment, false); - - return map; + return attach_and_init(seg_meta, 0, size_ctl_structure, + data_seg_alignment, false); } /* ////////////////////////////////////////////////////////////////////////// */ -mca_common_sm_module_t * -mca_common_sm_init(ompi_proc_t **procs, - size_t num_procs, - size_t size, - char *file_name, - size_t size_ctl_structure, - size_t data_seg_alignment) +int +mca_common_sm_module_unlink(mca_common_sm_module_t *modp) { - /* indicates whether or not i'm the lowest named process */ - bool lowest_local_proc = false; - mca_common_sm_module_t *map = NULL; - ompi_proc_t *temp_proc = NULL; - bool found_lowest = false; - size_t num_local_procs = 0, p = 0; - opal_shmem_ds_t *seg_meta = NULL; + if (NULL == modp) { + return OMPI_ERROR; + } + if (OPAL_SUCCESS != opal_shmem_unlink(&modp->shmem_ds)) { + return OMPI_ERROR; + } + return OMPI_SUCCESS; +} +/* ////////////////////////////////////////////////////////////////////////// */ +int +mca_common_sm_local_proc_reorder(ompi_proc_t **procs, + size_t num_procs, + size_t *out_num_local_procs) +{ + size_t num_local_procs = 0; + bool found_lowest = false; + ompi_proc_t *temp_proc = NULL; + size_t p; + + if (NULL == out_num_local_procs || NULL == procs) { + return OMPI_ERR_BAD_PARAM; + } /* o reorder procs array to have all the local procs at the beginning. * o look for the local proc with the lowest name. * o determine the number of local procs. 
@@ -240,8 +247,7 @@ mca_common_sm_init(ompi_proc_t **procs, /* save this proc */ procs[num_local_procs] = procs[p]; /* if we have a new lowest, swap it with position 0 - * so that procs[0] is always the lowest named proc - */ + * so that procs[0] is always the lowest named proc */ if (OPAL_VALUE2_GREATER == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &(procs[p]->proc_name), @@ -257,6 +263,31 @@ mca_common_sm_init(ompi_proc_t **procs, ++num_local_procs; } } + *out_num_local_procs = num_local_procs; + + return OMPI_SUCCESS; +} + +/* ////////////////////////////////////////////////////////////////////////// */ +mca_common_sm_module_t * +mca_common_sm_init(ompi_proc_t **procs, + size_t num_procs, + size_t size, + char *file_name, + size_t size_ctl_structure, + size_t data_seg_alignment) +{ + /* indicates whether or not i'm the lowest named process */ + bool lowest_local_proc = false; + mca_common_sm_module_t *map = NULL; + size_t num_local_procs = 0; + opal_shmem_ds_t *seg_meta = NULL; + + if (OMPI_SUCCESS != mca_common_sm_local_proc_reorder(procs, + num_procs, + &num_local_procs)) { + return NULL; + } /* if there is less than 2 local processes, there's nothing to do. */ if (num_local_procs < 2) { @@ -270,9 +301,9 @@ mca_common_sm_init(ompi_proc_t **procs, /* determine whether or not i am the lowest local process */ lowest_local_proc = - (0 == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, - ORTE_PROC_MY_NAME, - &(procs[0]->proc_name))); + (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, + ORTE_PROC_MY_NAME, + &(procs[0]->proc_name))); /* figure out if i am the lowest rank in the group. 
* if so, i will create the shared memory backing store @@ -434,4 +465,3 @@ mca_common_sm_fini(mca_common_sm_module_t *mca_common_sm_module) } return rc; } - diff --git a/ompi/mca/common/sm/common_sm.h b/ompi/mca/common/sm/common_sm.h index b8fd007e1e..c916cc603c 100644 --- a/ompi/mca/common/sm/common_sm.h +++ b/ompi/mca/common/sm/common_sm.h @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2011 Los Alamos National Security, LLC. + * Copyright (c) 2010-2012 Los Alamos National Security, LLC. * All rights reserved. * $COPYRIGHT$ * @@ -73,18 +73,31 @@ typedef struct mca_common_sm_module_t { OBJ_CLASS_DECLARATION(mca_common_sm_module_t); /** - * This routine is used to create a shared memory segment (whether - * it's an mmaped file or a SYSV IPC segment). It is assumed that + * This routine reorders procs array to have all the local procs at the + * beginning and returns the number of local procs through out_num_local_procs. + * The proc with the lowest name is at the beginning of the reordered procs + * array. + * + * @returnvalue OMPI_SUCCESS on success, something else, otherwise. + */ +OMPI_DECLSPEC extern int +mca_common_sm_local_proc_reorder(ompi_proc_t **procs, + size_t num_procs, + size_t *out_num_local_procs); + +/** + * This routine is used to create and attach to a shared memory segment + * (whether it's an mmaped file or a SYSV IPC segment). It is assumed that * the shared memory segment does not exist before this call. * * @returnvalue pointer to control structure at head of shared memory segment. * Returns NULL if an error occurred. 
*/ -mca_common_sm_module_t * -mca_common_sm_module_create(size_t size, - char *file_name, - size_t size_ctl_structure, - size_t data_seg_alignment); +OMPI_DECLSPEC extern mca_common_sm_module_t * +mca_common_sm_module_create_and_attach(size_t size, + char *file_name, + size_t size_ctl_structure, + size_t data_seg_alignment); /** * This routine is used to attach to the shared memory segment associated with @@ -96,11 +109,22 @@ mca_common_sm_module_create(size_t size, * @returnvalue pointer to control structure at head of shared memory segment. * Returns NULL if an error occurred. */ -mca_common_sm_module_t * +OMPI_DECLSPEC extern mca_common_sm_module_t * mca_common_sm_module_attach(opal_shmem_ds_t *seg_meta, size_t size_ctl_structure, size_t data_seg_alignment); +/** + * A thin wrapper around opal_shmem_unlink. + * + * @ modp points to an initialized mca_common_sm_module_t. + * + * @returnvalue OMPI_SUCCESS if the operation completed successfully, + * OMPI_ERROR otherwise. + */ +OMPI_DECLSPEC extern int +mca_common_sm_module_unlink(mca_common_sm_module_t *modp); + /** * This routine is used to set up a shared memory segment (whether * it's an mmaped file or a SYSV IPC segment). It is assumed that @@ -164,7 +188,7 @@ mca_common_sm_init_group(ompi_group_t *group, */ OMPI_DECLSPEC extern void * mca_common_sm_seg_alloc(struct mca_mpool_base_module_t *mpool, - size_t* size, + size_t *size, mca_mpool_base_registration_t **registration); /** @@ -189,4 +213,3 @@ OMPI_DECLSPEC extern mca_common_sm_module_t *mca_common_sm_module; END_C_DECLS #endif /* _COMMON_SM_H_ */ - diff --git a/ompi/mca/mpool/sm/mpool_sm.h b/ompi/mca/mpool/sm/mpool_sm.h index b46bc044d5..9666b3b63f 100644 --- a/ompi/mca/mpool/sm/mpool_sm.h +++ b/ompi/mca/mpool/sm/mpool_sm.h @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010 Los Alamos National Security, LLC. 
+ * Copyright (c) 2010-2012 Los Alamos National Security, LLC. * All rights reserved. * $COPYRIGHT$ * @@ -28,6 +28,7 @@ #include "ompi_config.h" #include "opal/mca/event/event.h" +#include "opal/mca/shmem/shmem.h" #include "ompi/mca/common/sm/common_sm.h" #include "ompi/mca/mpool/mpool.h" @@ -36,17 +37,19 @@ BEGIN_C_DECLS struct mca_mpool_sm_component_t { - mca_mpool_base_component_t super; - /* mca_allocator_base_module_t* sm_allocator; */ - char* sm_allocator_name; - int verbose; - /* struct mca_mpool_sm_mmap_t *sm_mmap; */ + mca_mpool_base_component_t super; + /* mca_allocator_base_module_t* sm_allocator; */ + char *sm_allocator_name; + int verbose; + /* struct mca_mpool_sm_mmap_t *sm_mmap; */ }; typedef struct mca_mpool_sm_component_t mca_mpool_sm_component_t; typedef struct mca_mpool_base_resources_t { size_t size; int32_t mem_node; + /* backing store metadata */ + opal_shmem_ds_t bs_meta_buf; } mca_mpool_base_resources_t; OMPI_MODULE_DECLSPEC extern mca_mpool_sm_component_t mca_mpool_sm_component; @@ -54,7 +57,7 @@ OMPI_MODULE_DECLSPEC extern mca_mpool_sm_component_t mca_mpool_sm_component; typedef struct mca_mpool_sm_module_t { mca_mpool_base_module_t super; long sm_size; - mca_allocator_base_module_t * sm_allocator; + mca_allocator_base_module_t *sm_allocator; struct mca_mpool_sm_mmap_t *sm_mmap; mca_common_sm_module_t *sm_common_module; int32_t mem_node; diff --git a/ompi/mca/mpool/sm/mpool_sm_component.c b/ompi/mca/mpool/sm/mpool_sm_component.c index bccaf78e38..e5f9ff3994 100644 --- a/ompi/mca/mpool/sm/mpool_sm_component.c +++ b/ompi/mca/mpool/sm/mpool_sm_component.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010 Los Alamos National Security, LLC. + * Copyright (c) 2010-2012 Los Alamos National Security, LLC. * All rights reserved. 
* $COPYRIGHT$ * @@ -45,10 +45,14 @@ /* * Local functions */ -static int mca_mpool_sm_open(void); -static int mca_mpool_sm_close( void ); -static mca_mpool_base_module_t* mca_mpool_sm_init( - struct mca_mpool_base_resources_t* resources); +static int +mca_mpool_sm_open(void); + +static int +mca_mpool_sm_close(void); + +static mca_mpool_base_module_t * +mca_mpool_sm_init(struct mca_mpool_base_resources_t* resources); mca_mpool_sm_component_t mca_mpool_sm_component = { { @@ -90,8 +94,8 @@ static int mca_mpool_sm_open(void) /* register SM component parameters */ mca_base_param_reg_string(&mca_mpool_sm_component.super.mpool_version, "allocator", - "Name of allocator component to use with sm mpool", - false, false, + "Name of allocator component " + "to use with sm mpool", false, false, "bucket", &mca_mpool_sm_component.sm_allocator_name); @@ -100,18 +104,18 @@ static int mca_mpool_sm_open(void) * to be set up to 2GB-1 for 32 bit and much greater for 64 bit. */ asprintf(&size_str, "%ld", default_min); mca_base_param_reg_string(&mca_mpool_sm_component.super.mpool_version, - "min_size", - "Minimum size of the sm mpool shared memory file", - false, false, size_str, &min_size_param); + "min_size", + "Minimum size of the sm mpool shared memory file", + false, false, size_str, &min_size_param); free(size_str); mca_base_param_reg_int(&mca_mpool_sm_component.super.mpool_version, - "verbose", - "Enable verbose output for mpool sm component", - false, false, 0, &value); + "verbose", + "Enable verbose output for mpool sm component", + false, false, 0, &value); if (value != 0) { - mca_mpool_sm_component.verbose = opal_output_open(NULL); + mca_mpool_sm_component.verbose = opal_output_open(NULL); } else { - mca_mpool_sm_component.verbose = -1; + mca_mpool_sm_component.verbose = -1; } return OMPI_SUCCESS; @@ -128,41 +132,44 @@ static int mca_mpool_sm_close( void ) return OMPI_SUCCESS; } -static mca_mpool_base_module_t* mca_mpool_sm_init( - struct mca_mpool_base_resources_t* resources) 
+static mca_mpool_base_module_t * +mca_mpool_sm_init(struct mca_mpool_base_resources_t *resources) { - char *file_name; - int len; - mca_mpool_sm_module_t* mpool_module; + mca_mpool_sm_module_t *mpool_module; mca_allocator_base_component_t* allocator_component; long min_size; ompi_proc_t **procs; size_t num_all_procs, i, num_local_procs = 0; /* README: this needs to change if procs in different jobs (even - spawned ones) are to talk using shared memory */ - procs = ompi_proc_world(&num_all_procs); + * spawned ones) are to talk using shared memory */ + if (NULL == (procs = ompi_proc_world(&num_all_procs))) { + /* out of resources, so just bail */ + return NULL; + } for (i = 0 ; i < num_all_procs ; ++i) { if (OPAL_PROC_ON_LOCAL_NODE(procs[i]->proc_flags)) { num_local_procs++; } } - /* parse the min size and validate it */ - /* if other parameters are added, absolutely necessary to reset errno each time */ + /* if other parameters are added, absolutely + * necessary to reset errno each time */ errno = 0; min_size = strtol(min_size_param, (char **)NULL, 10); if (errno == ERANGE) { - opal_output(0, "mca_mpool_sm_init: min_size overflows! set to default (%ld)", default_min); + opal_output(0, "mca_mpool_sm_init: min_size overflows! " + "set to default (%ld)", default_min); min_size = default_min; } else if (errno == EINVAL) { - opal_output(0, "mca_mpool_sm_init: invalid min_size entered. set it to (%ld)", default_min); + opal_output(0, "mca_mpool_sm_init: invalid min_size entered. 
" + "set it to (%ld)", default_min); min_size = default_min; } /* Make a new mpool module */ mpool_module = - (mca_mpool_sm_module_t*)malloc(sizeof(mca_mpool_sm_module_t)); + (mca_mpool_sm_module_t *)malloc(sizeof(mca_mpool_sm_module_t)); mca_mpool_sm_module_init(mpool_module); /* set sm_size */ @@ -173,23 +180,26 @@ static mca_mpool_base_module_t* mca_mpool_sm_init( mpool_module->sm_size = min_size; } - /* add something for the control structure */ - mpool_module->sm_size += sizeof(mca_common_sm_module_t); - allocator_component = mca_allocator_component_lookup( mca_mpool_sm_component.sm_allocator_name); /* if specified allocator cannot be loaded - look for an alternative */ - if(NULL == allocator_component) { - if(opal_list_get_size(&mca_allocator_base_components) == 0) { - mca_base_component_list_item_t* item = (mca_base_component_list_item_t*) + if (NULL == allocator_component) { + if (opal_list_get_size(&mca_allocator_base_components) == 0) { + mca_base_component_list_item_t *item = + (mca_base_component_list_item_t *) opal_list_get_first(&mca_allocator_base_components); - allocator_component = (mca_allocator_base_component_t*)item->cli_component; - opal_output(0, "mca_mpool_sm_init: unable to locate allocator: %s - using %s\n", - mca_mpool_sm_component.sm_allocator_name, allocator_component->allocator_version.mca_component_name); + allocator_component = + (mca_allocator_base_component_t *)item->cli_component; + opal_output( + 0, "mca_mpool_sm_init: " + "unable to locate allocator: %s - using %s\n", + mca_mpool_sm_component.sm_allocator_name, + allocator_component->allocator_version.mca_component_name); } else { - opal_output(0, "mca_mpool_sm_init: unable to locate allocator: %s\n", - mca_mpool_sm_component.sm_allocator_name); + opal_output(0, "mca_mpool_sm_init: " + "unable to locate allocator: %s\n", + mca_mpool_sm_component.sm_allocator_name); free(procs); return NULL; } @@ -197,41 +207,28 @@ static mca_mpool_base_module_t* mca_mpool_sm_init( 
mpool_module->mem_node = resources->mem_node; - /* create initial shared memory mapping */ - len = asprintf( &file_name, "%s"OPAL_PATH_SEP"shared_mem_pool.%s", - orte_process_info.job_session_dir, - orte_process_info.nodename ); - if ( 0 > len ) { - free(mpool_module); - free(procs); - return NULL; - } - opal_output(mca_mpool_sm_component.verbose, "mca_mpool_sm_init: shared memory size used: (%ld)", mpool_module->sm_size); - if (NULL == (mpool_module->sm_common_module = - mca_common_sm_init(procs, num_all_procs, - mpool_module->sm_size, - file_name, + if (NULL == (mpool_module->sm_common_module = + mca_common_sm_module_attach(&resources->bs_meta_buf, sizeof(mca_common_sm_module_t), 8))) { - opal_output(mca_mpool_sm_component.verbose, - "mca_mpool_sm_init: unable to create shared memory mapping (%s)", file_name); - free(file_name); + opal_output(mca_mpool_sm_component.verbose, "mca_mpool_sm_init: " + "unable to create shared memory mapping (%s)", + resources->bs_meta_buf.seg_name); free(mpool_module); free(procs); return NULL; } free(procs); - free(file_name); /* setup allocator */ mpool_module->sm_allocator = allocator_component->allocator_init(true, mca_common_sm_seg_alloc, NULL, &(mpool_module->super)); - if(NULL == mpool_module->sm_allocator) { + if (NULL == mpool_module->sm_allocator) { opal_output(0, "mca_mpool_sm_init: unable to initialize allocator"); free(mpool_module); return NULL; diff --git a/opal/mca/shmem/mmap/shmem_mmap_module.c b/opal/mca/shmem/mmap/shmem_mmap_module.c index 23e0195a37..c63d633987 100644 --- a/opal/mca/shmem/mmap/shmem_mmap_module.c +++ b/opal/mca/shmem/mmap/shmem_mmap_module.c @@ -122,13 +122,12 @@ opal_shmem_mmap_module_t opal_shmem_mmap_module = { static inline void shmem_ds_reset(opal_shmem_ds_t *ds_buf) { + /* don't print ds_buf info here, as we may be printing garbage. 
*/ OPAL_OUTPUT_VERBOSE( (70, opal_shmem_base_output, - "%s: %s: shmem_ds_resetting " - "(id: %d, size: %lu, name: %s)\n", + "%s: %s: shmem_ds_resetting\n", mca_shmem_mmap_component.super.base_version.mca_type_name, - mca_shmem_mmap_component.super.base_version.mca_component_name, - ds_buf->seg_id, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) + mca_shmem_mmap_component.super.base_version.mca_component_name) ); ds_buf->seg_cpid = 0; diff --git a/opal/mca/shmem/posix/shmem_posix_module.c b/opal/mca/shmem/posix/shmem_posix_module.c index 22789cafb0..0ef9cfe5a3 100644 --- a/opal/mca/shmem/posix/shmem_posix_module.c +++ b/opal/mca/shmem/posix/shmem_posix_module.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. - * Copyright (c) 2010-2011 Los Alamos National Security, LLC. + * Copyright (c) 2010-2012 Los Alamos National Security, LLC. * All rights reserved. * * $COPYRIGHT$ @@ -108,13 +108,12 @@ opal_shmem_posix_module_t opal_shmem_posix_module = { static inline void shmem_ds_reset(opal_shmem_ds_t *ds_buf) { + /* don't print ds_buf info here, as we may be printing garbage. */ OPAL_OUTPUT_VERBOSE( (70, opal_shmem_base_output, - "%s: %s: shmem_ds_resetting " - "(id: %d, size: %lu, name: %s)\n", + "%s: %s: shmem_ds_resetting\n", mca_shmem_posix_component.super.base_version.mca_type_name, - mca_shmem_posix_component.super.base_version.mca_component_name, - ds_buf->seg_id, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) + mca_shmem_posix_component.super.base_version.mca_component_name) ); ds_buf->seg_cpid = 0; diff --git a/opal/mca/shmem/shmem_types.h b/opal/mca/shmem/shmem_types.h index b2e165b2be..ee7f9b202d 100644 --- a/opal/mca/shmem/shmem_types.h +++ b/opal/mca/shmem/shmem_types.h @@ -12,7 +12,7 @@ * Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. 
* Copyright (c) 2010 IBM Corporation. All rights reserved. - * Copyright (c) 2010-2011 Los Alamos National Security, LLC. + * Copyright (c) 2010-2012 Los Alamos National Security, LLC. * All rights reserved. * $COPYRIGHT$ * @@ -33,6 +33,13 @@ #include "opal_config.h" +#ifdef HAVE_STDDEF_H +#include <stddef.h> +#endif /* HAVE_STDDEF_H */ +#ifdef HAVE_STRING_H +#include <string.h> +#endif /* HAVE_STRING_H */ + BEGIN_C_DECLS /* ////////////////////////////////////////////////////////////////////////// */ @@ -92,7 +99,6 @@ do { \ #define OPAL_SHMEM_DS_IS_VALID(ds_buf) \ ( (ds_buf)->flags & OPAL_SHMEM_DS_FLAGS_VALID ) -/* ////////////////////////////////////////////////////////////////////////// */ typedef uint8_t opal_shmem_ds_flag_t; /* shared memory segment header */ @@ -113,13 +119,35 @@ struct opal_shmem_ds_t { int seg_id; /* size of shared memory segment */ size_t seg_size; - /* path to backing store */ - char seg_name[OPAL_PATH_MAX]; /* base address of shared memory segment */ unsigned char *seg_base_addr; + /* path to backing store -- last element so we can easily calculate the + * "real" size of opal_shmem_ds_t. that is, the amount of the struct that + * is actually being used. for example: if seg_name is something like: + * "foo_baz" and OPAL_PATH_MAX is 4096, we want to know that only a very + * limited amount of the seg_name buffer is actually being used. + */ + char seg_name[OPAL_PATH_MAX]; }; typedef struct opal_shmem_ds_t opal_shmem_ds_t; +/* ////////////////////////////////////////////////////////////////////////// */ +/** + * Simply returns the amount of used space. For use when sending the entire + * opal_shmem_ds_t payload isn't viable -- due to the potential disparity + * between the reserved buffer space and what is actually in use.
+ */ +static inline size_t +opal_shmem_sizeof_shmem_ds(const opal_shmem_ds_t *ds_bufp) +{ + char *name_base = NULL; + size_t name_buf_offset = offsetof(opal_shmem_ds_t, seg_name); + + name_base = (char *)ds_bufp + name_buf_offset; + + return name_buf_offset + strlen(name_base) + 1; +} + END_C_DECLS #endif /* OPAL_SHMEM_TYPES_H */ diff --git a/opal/mca/shmem/sysv/shmem_sysv_module.c b/opal/mca/shmem/sysv/shmem_sysv_module.c index c0d13f429f..1d8389c938 100644 --- a/opal/mca/shmem/sysv/shmem_sysv_module.c +++ b/opal/mca/shmem/sysv/shmem_sysv_module.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. - * Copyright (c) 2010-2011 Los Alamos National Security, LLC. + * Copyright (c) 2010-2012 Los Alamos National Security, LLC. * All rights reserved. * * $COPYRIGHT$ @@ -113,13 +113,12 @@ opal_shmem_sysv_module_t opal_shmem_sysv_module = { static inline void shmem_ds_reset(opal_shmem_ds_t *ds_buf) { + /* don't print ds_buf info here, as we may be printing garbage. 
*/ OPAL_OUTPUT_VERBOSE( (70, opal_shmem_base_output, - "%s: %s: shmem_ds_resetting " - "(id: %d, size: %lu, name: %s)\n", + "%s: %s: shmem_ds_resetting\n", mca_shmem_sysv_component.super.base_version.mca_type_name, - mca_shmem_sysv_component.super.base_version.mca_component_name, - ds_buf->seg_id, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) + mca_shmem_sysv_component.super.base_version.mca_component_name) ); ds_buf->seg_cpid = 0; @@ -195,7 +194,7 @@ segment_create(opal_shmem_ds_t *ds_buf, * real_size here */ if (-1 == (ds_buf->seg_id = shmget(IPC_PRIVATE, real_size, - IPC_CREAT | IPC_EXCL | SHM_R | SHM_W))) { + IPC_CREAT | IPC_EXCL | SHM_R | SHM_W))) { int err = errno; char hn[MAXHOSTNAMELEN]; gethostname(hn, MAXHOSTNAMELEN - 1); diff --git a/opal/mca/shmem/windows/shmem_windows_module.c b/opal/mca/shmem/windows/shmem_windows_module.c index 434906ff53..1971627fbe 100644 --- a/opal/mca/shmem/windows/shmem_windows_module.c +++ b/opal/mca/shmem/windows/shmem_windows_module.c @@ -114,13 +114,12 @@ opal_shmem_windows_module_t opal_shmem_windows_module = { static inline void shmem_ds_reset(opal_shmem_ds_t *ds_buf) { + /* don't print ds_buf info here, as we may be printing garbage. */ OPAL_OUTPUT_VERBOSE( (70, opal_shmem_base_output, - "%s: %s: shmem_ds_resetting " - "(id: %d, size: %"PRIsize_t", name: %s)\n", + "%s: %s: shmem_ds_resetting\n", mca_shmem_windows_component.super.base_version.mca_type_name, - mca_shmem_windows_component.super.base_version.mca_component_name, - ds_buf->seg_id, ds_buf->seg_size, ds_buf->seg_name) + mca_shmem_windows_component.super.base_version.mca_component_name) ); ds_buf->seg_cpid = 0;