From 2c802732468f0c73d546cc5a83fec1bca5c0594c Mon Sep 17 00:00:00 2001 From: Samuel Gutierrez Date: Fri, 10 Aug 2012 19:51:41 +0000 Subject: [PATCH] sm BTL initialization via modex. This commit was SVN r26987. --- ompi/mca/btl/sm/btl_sm.c | 145 ++++++++++-------- ompi/mca/btl/sm/btl_sm.h | 13 +- ompi/mca/btl/sm/btl_sm_component.c | 198 ++++++++++++++++++++++--- ompi/mca/btl/sm/help-mpi-btl-sm.txt | 5 + ompi/mca/common/sm/common_sm.c | 89 +++++++---- ompi/mca/common/sm/common_sm.h | 33 ++++- ompi/mca/mpool/sm/mpool_sm.h | 17 ++- ompi/mca/mpool/sm/mpool_sm_component.c | 95 +++++++----- 8 files changed, 431 insertions(+), 164 deletions(-) diff --git a/ompi/mca/btl/sm/btl_sm.c b/ompi/mca/btl/sm/btl_sm.c index 21c55e1e82..fc2ef4264b 100644 --- a/ompi/mca/btl/sm/btl_sm.c +++ b/ompi/mca/btl/sm/btl_sm.c @@ -46,6 +46,7 @@ #include "orte/util/proc_info.h" #include "opal/datatype/opal_convertor.h" #include "ompi/class/ompi_free_list.h" +#include "ompi/runtime/ompi_module_exchange.h" #include "ompi/mca/btl/btl.h" #include "ompi/mca/mpool/base/base.h" #include "ompi/mca/mpool/sm/mpool_sm.h" @@ -111,7 +112,6 @@ mca_btl_sm_t mca_btl_sm = { */ #define OFFSET2ADDR(OFFSET, BASE) ((ptrdiff_t)(OFFSET) + (char*)(BASE)) - static void *mpool_calloc(size_t nmemb, size_t size) { void *buf; @@ -127,17 +127,43 @@ static void *mpool_calloc(size_t nmemb, size_t size) return buf; } - -static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n) +/* + * Returns a pointer to node rank zero. Returns NULL on error. + */ +static inline ompi_proc_t * +get_node_rank_zero_proc_ptr(ompi_proc_t **proc_world, + size_t proc_world_size) { - size_t size, length, length_payload; - char *sm_ctl_file; + size_t num_local_procs = 0; + + if (NULL == proc_world) { + return NULL; + } + /* sort the procs list and get a pointer to the lowest node rank */ + if (OMPI_SUCCESS != mca_common_sm_local_proc_reorder(proc_world, + proc_world_size, + &num_local_procs)) { + opal_output(0, "mca_common_sm_local_proc_reorder failure! " + "Cannot continue.\n"); + return NULL; + } + + return proc_world[0]; +} + +static int +sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n) +{ + size_t length, length_payload, proc_world_size = 0; sm_fifo_t *my_fifos; int my_mem_node, num_mem_nodes, i; - ompi_proc_t **procs; - size_t num_procs; + /* XXX SKG - malloc! */ mca_mpool_base_resources_t res; mca_btl_sm_component_t* m = &mca_btl_sm_component; + ompi_proc_t **proc_world = NULL; + ompi_proc_t *proc_node_rank_zero = NULL; + mca_btl_sm_modex_t *modex = NULL; + size_t modex_size = 0; /* Assume we don't have hwloc support and fill in dummy info */ mca_btl_sm_component.mem_node = my_mem_node = 0; @@ -191,8 +217,9 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n) #endif /* lookup shared memory pool */ - mca_btl_sm_component.sm_mpools = (mca_mpool_base_module_t **) calloc(num_mem_nodes, - sizeof(mca_mpool_base_module_t*)); + mca_btl_sm_component.sm_mpools = + (mca_mpool_base_module_t **)calloc(num_mem_nodes, + sizeof(mca_mpool_base_module_t *)); /* Disable memory binding, because each MPI process will claim pages in the mpool for their local NUMA node */ @@ -227,7 +254,7 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n) } res.size *= n; - /* now, create it */ + /* SKG - mpool create - now, create it */ mca_btl_sm_component.sm_mpools[0] = mca_mpool_base_module_create(mca_btl_sm_component.sm_mpool_name, sm_btl, &res); @@ -248,34 +275,40 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n) return OMPI_ERR_OUT_OF_RESOURCE; } - /* Allocate Shared Memory BTL process coordination - * data structure. This will reside in shared memory */ - - /* set file name */ - if (asprintf(&sm_ctl_file, "%s"OPAL_PATH_SEP"shared_mem_btl_module.%s", - orte_process_info.job_session_dir, - orte_process_info.nodename) < 0) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - - /* Pass in a data segment alignment of 0 to get no data - segment (only the shared control structure) */ - size = sizeof(mca_common_sm_seg_header_t) + - n * (sizeof(sm_fifo_t*) + sizeof(char *) + sizeof(uint16_t)) + opal_cache_line_size; - procs = ompi_proc_world(&num_procs); - if (!(mca_btl_sm_component.sm_seg = - mca_common_sm_init(procs, num_procs, size, sm_ctl_file, - sizeof(mca_common_sm_seg_header_t), - opal_cache_line_size))) { - opal_output(0, "mca_btl_sm_add_procs: unable to create shared memory " - "BTL coordinating strucure :: size %lu \n", - (unsigned long)size); - free(procs); - free(sm_ctl_file); + /* now let's receive the modex info from node rank zero. */ + if (NULL == (proc_world = ompi_proc_world(&proc_world_size))) { + opal_output(0, "ompi_proc_world failure! Cannot continue.\n"); return OMPI_ERROR; } - free(procs); - free(sm_ctl_file); + if (NULL == (proc_node_rank_zero = + get_node_rank_zero_proc_ptr(proc_world, proc_world_size))) { + opal_output(0, "get_node_rank_zero_proc_ptr failure! " + "Cannot continue.\n"); + free(proc_world); + return OMPI_ERROR; + } + if (OMPI_SUCCESS != + ompi_modex_recv(&mca_btl_sm_component.super.btl_version, + proc_node_rank_zero, + (void **)&modex, + &modex_size)) { + opal_output(0, "sm_btl_first_time_init: ompi_modex_recv failure!\n"); + free(proc_world); + return OMPI_ERROR; + } + if (NULL == (mca_btl_sm_component.sm_seg = + mca_common_sm_module_attach(&modex->sm_meta_buf, + sizeof(mca_common_sm_seg_header_t), + opal_cache_line_size))) { + /* don't have to detach here, because module_attach cleans up after + * itself on failure. */ + opal_output(0, "sm_btl_first_time_init: " + "mca_common_sm_module_attach failure!\n"); + free(proc_world); + return OMPI_ERROR; + } + free(proc_world); + free(modex); /* check to make sure number of local procs is within the * specified limits */ @@ -374,6 +407,7 @@ static struct mca_btl_base_endpoint_t * create_sm_endpoint(int local_proc, struct ompi_proc_t *proc) { struct mca_btl_base_endpoint_t *ep; + #if OMPI_ENABLE_PROGRESS_THREADS == 1 char path[PATH_MAX]; #endif @@ -401,22 +435,6 @@ create_sm_endpoint(int local_proc, struct ompi_proc_t *proc) return ep; } -static void calc_sm_max_procs(int n) -{ - /* see if need to allocate space for extra procs */ - if(0 > mca_btl_sm_component.sm_max_procs) { - /* no limit */ - if(0 <= mca_btl_sm_component.sm_extra_procs) { - /* limit */ - mca_btl_sm_component.sm_max_procs = - n + mca_btl_sm_component.sm_extra_procs; - } else { - /* no limit */ - mca_btl_sm_component.sm_max_procs = 2 * n; - } - } -} - int mca_btl_sm_add_procs( struct mca_btl_base_module_t* btl, size_t nprocs, @@ -442,7 +460,7 @@ int mca_btl_sm_add_procs( * and idetify procs that are on this host. Add procs on this * host to shared memory reachbility list. Also, get number * of local procs in the procs list. */ - for(proc = 0; proc < (int32_t)nprocs; proc++) { + for (proc = 0; proc < (int32_t)nprocs; proc++) { /* check to see if this proc can be reached via shmem (i.e., if they're on my local host and in my job) */ if (procs[proc]->proc_name.jobid != my_proc->proc_name.jobid || @@ -477,18 +495,17 @@ int mca_btl_sm_add_procs( goto CLEANUP; /* make sure that my_smp_rank has been defined */ - if(-1 == my_smp_rank) { + if (-1 == my_smp_rank) { return_code = OMPI_ERROR; goto CLEANUP; } - calc_sm_max_procs(n_local_procs); - if (!sm_btl->btl_inited) { return_code = sm_btl_first_time_init(sm_btl, mca_btl_sm_component.sm_max_procs); - if(return_code != OMPI_SUCCESS) + if (return_code != OMPI_SUCCESS) { goto CLEANUP; + } } /* set local proc's smp rank in the peers structure for @@ -524,13 +541,25 @@ int mca_btl_sm_add_procs( /* Sync with other local procs. Force the FIFO initialization to always * happens before the readers access it. */ - opal_atomic_add_32( &mca_btl_sm_component.sm_seg->module_seg->seg_inited, 1); + opal_atomic_add_32(&mca_btl_sm_component.sm_seg->module_seg->seg_inited, 1); while( n_local_procs > mca_btl_sm_component.sm_seg->module_seg->seg_inited) { opal_progress(); opal_atomic_rmb(); } + /* it is now safe to unlink the shared memory segment. only one process + * needs to do this, so just let smp rank zero take care of it. */ + if (0 == my_smp_rank) { + if (OMPI_SUCCESS != + mca_common_sm_module_unlink(mca_btl_sm_component.sm_seg)) { + /* it is "okay" if this fails at this point. we have gone this far, + * so just warn about the failure and continue. this is probably + * only triggered by a programming error. */ + opal_output(0, "WARNING: common_sm_module_unlink failed.\n"); + } + } + /* coordinate with other processes */ for(j = mca_btl_sm_component.num_smp_procs; j < mca_btl_sm_component.num_smp_procs + n_local_procs; j++) { diff --git a/ompi/mca/btl/sm/btl_sm.h b/ompi/mca/btl/sm/btl_sm.h index 2a02f543b4..ed07d5f832 100644 --- a/ompi/mca/btl/sm/btl_sm.h +++ b/ompi/mca/btl/sm/btl_sm.h @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved. * Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010 Los Alamos National Security, LLC. + * Copyright (c) 2010-2012 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2010-2012 IBM Corporation. All rights reserved. * $COPYRIGHT$ @@ -42,6 +42,8 @@ #include "opal/util/bit_ops.h" #include "opal/class/opal_free_list.h" +#include "opal/mca/shmem/shmem.h" + #include "ompi/mca/btl/btl.h" #include "ompi/mca/common/sm/common_sm.h" @@ -121,6 +123,15 @@ typedef struct mca_btl_sm_mem_node_t { mca_mpool_base_module_t* sm_mpool; /**< shared memory pool */ } mca_btl_sm_mem_node_t; +/** + * Shared Memory (SM) BTL modex. + */ +struct mca_btl_sm_modex_t { + opal_shmem_ds_t sm_meta_buf; + opal_shmem_ds_t sm_mpool_meta_buf; +}; +typedef struct mca_btl_sm_modex_t mca_btl_sm_modex_t; + /** * Shared Memory (SM) BTL module. */ diff --git a/ompi/mca/btl/sm/btl_sm_component.c b/ompi/mca/btl/sm/btl_sm_component.c index bab8487dfe..b7bbdb2ce0 100644 --- a/ompi/mca/btl/sm/btl_sm_component.c +++ b/ompi/mca/btl/sm/btl_sm_component.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved. * Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2011 Los Alamos National Security, LLC. + * Copyright (c) 2010-2012 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2011 NVIDIA Corporation. All rights reserved. * Copyright (c) 2010-2012 IBM Corporation. All rights reserved. @@ -42,15 +42,20 @@ #include /* for mkfifo */ #endif /* HAVE_SYS_STAT_H */ -#include "ompi/constants.h" #include "opal/mca/event/event.h" +#include "opal/mca/base/mca_base_param.h" +#include "opal/mca/shmem/base/base.h" +#include "opal/mca/shmem/shmem.h" #include "opal/util/bit_ops.h" #include "opal/util/output.h" + #include "orte/util/proc_info.h" #include "orte/util/show_help.h" #include "orte/runtime/orte_globals.h" +#include "orte/util/proc_info.h" -#include "opal/mca/base/mca_base_param.h" +#include "ompi/constants.h" +#include "ompi/runtime/ompi_module_exchange.h" #include "ompi/mca/mpool/base/base.h" #include "ompi/mca/common/sm/common_sm.h" #include "ompi/mca/btl/base/btl_base_error.h" @@ -351,52 +356,197 @@ CLEANUP: return return_value; } +/* + * Returns the number of processes on the node. + */ +static inline int +get_num_local_procs(void) +{ + /* num_local_peers does not include us in + * its calculation, so adjust for that */ + return (int)(1 + orte_process_info.num_local_peers); +} + +static void +calc_sm_max_procs(int n) +{ + /* see if need to allocate space for extra procs */ + if (0 > mca_btl_sm_component.sm_max_procs) { + /* no limit */ + if (0 <= mca_btl_sm_component.sm_extra_procs) { + /* limit */ + mca_btl_sm_component.sm_max_procs = + n + mca_btl_sm_component.sm_extra_procs; + } else { + /* no limit */ + mca_btl_sm_component.sm_max_procs = 2 * n; + } + } +} + +/* + * Creates the shared-memory segments required for this BTL. One for the sm + * mpool and another for the shared memory store and populates *modex_buf_ptr. + */ +static int +populate_modex_bufp(mca_btl_sm_component_t *comp_ptr, + mca_btl_sm_modex_t *modex_buf_ptr) +{ + int rc = OMPI_SUCCESS; + size_t size = 0; + char *sm_mpool_ctl_file = NULL; + char *sm_ctl_file = NULL; + + /* first generate some unique paths for the shared-memory segments */ + if (asprintf(&sm_mpool_ctl_file, + "%s"OPAL_PATH_SEP"shared_mem_btl_module.%s", + orte_process_info.job_session_dir, + orte_process_info.nodename) < 0) { + rc = OMPI_ERR_OUT_OF_RESOURCE; + goto out; + } + if (asprintf(&sm_ctl_file, + "%s"OPAL_PATH_SEP"shared_mem_btl_module.%s", + orte_process_info.job_session_dir, + orte_process_info.nodename) < 0) { + rc = OMPI_ERR_OUT_OF_RESOURCE; + goto out; + } + + /* calculate the segment size */ + size = sizeof(mca_common_sm_seg_header_t) + + comp_ptr->sm_max_procs * (sizeof(sm_fifo_t *) + + sizeof(char *) + sizeof(uint16_t)) + opal_cache_line_size; + + /* create the things */ + if (NULL == (comp_ptr->sm_seg = + mca_common_sm_module_create(size, sm_ctl_file, + sizeof(mca_common_sm_seg_header_t), + opal_cache_line_size))) { + opal_output(0, "mca_btl_sm_add_procs: unable to create shared memory " + "BTL coordinating strucure :: size %lu \n", + (unsigned long)size); + rc = OMPI_ERROR; + goto out; + } + + /* now extract and store the shmem_ds info from the returned module */ + if (OPAL_SUCCESS != opal_shmem_ds_copy(&(comp_ptr->sm_seg->shmem_ds), + &(modex_buf_ptr->sm_meta_buf))) { + rc = OMPI_ERROR; + goto out; + } + +out: + if (NULL != sm_mpool_ctl_file) { + free(sm_mpool_ctl_file); + } + if (NULL != sm_ctl_file) { + free(sm_ctl_file); + } + return rc; + +} + +/* + * Creates information required for the sm modex and modex sends it. + */ +static int +send_modex(mca_btl_sm_component_t *comp_ptr) +{ + int rc = OMPI_SUCCESS; + mca_btl_sm_modex_t *sm_modex = NULL; + + if (NULL == (sm_modex = calloc(1, sizeof(*sm_modex)))) { + /* out of resources, so just bail. */ + return OMPI_ERR_OUT_OF_RESOURCE; + } + if (OMPI_SUCCESS != (rc = populate_modex_bufp(comp_ptr, sm_modex))) { + opal_output(0, "send_modex: populate_modex_bufp failure!\n"); + /* rc is set */ + goto out; + } + /* send the modex */ + rc = ompi_modex_send(&comp_ptr->super.btl_version, sm_modex, + sizeof(*sm_modex)); + +out: + if (NULL != sm_modex) { + free(sm_modex); + } + return rc; +} + /* * SM component initialization */ -static mca_btl_base_module_t** mca_btl_sm_component_init( - int *num_btls, - bool enable_progress_threads, - bool enable_mpi_threads) +static mca_btl_base_module_t ** +mca_btl_sm_component_init(int *num_btls, + bool enable_progress_threads, + bool enable_mpi_threads) { mca_btl_base_module_t **btls = NULL; + orte_node_rank_t my_node_rank = ORTE_NODE_RANK_INVALID; #if OMPI_BTL_SM_HAVE_KNEM int rc; #endif *num_btls = 0; - - /* if no session directory was created, then we cannot be used */ - if (!orte_create_session_dirs) { - return NULL; - } - /* lookup/create shared memory pool only when used */ mca_btl_sm_component.sm_mpool = NULL; mca_btl_sm_component.sm_mpool_base = NULL; -#if OMPI_ENABLE_PROGRESS_THREADS == 1 - /* create a named pipe to receive events */ - sprintf( mca_btl_sm_component.sm_fifo_path, - "%s"OPAL_PATH_SEP"sm_fifo.%lu", orte_process_info.job_session_dir, - (unsigned long)ORTE_PROC_MY_NAME->vpid ); - if(mkfifo(mca_btl_sm_component.sm_fifo_path, 0660) < 0) { - opal_output(0, "mca_btl_sm_component_init: mkfifo failed with errno=%d\n",errno); + /* if no session directory was created, then we cannot be used */ + /* SKG - this isn't true anymore. Some backing facilities don't require a + * file-backed store. Extend shmem to provide this info one day. */ + if (!orte_create_session_dirs) { return NULL; } - mca_btl_sm_component.sm_fifo_fd = open(mca_btl_sm_component.sm_fifo_path, O_RDWR); + /* if we don't have locality information, then we cannot be used */ + if (ORTE_NODE_RANK_INVALID == + (my_node_rank = orte_process_info.my_node_rank)) { + orte_show_help("help-mpi-btl-sm.txt", "no locality", true); + return NULL; + } + /* calculate max procs so we can figure out how large to make the + * shared-memory segment. this routine sets component sm_max_procs. */ + calc_sm_max_procs(get_num_local_procs()); + /* let local rank 0 create the shared-memory segments and send shmem info */ + if (0 == my_node_rank) { + if (OMPI_SUCCESS != send_modex(&mca_btl_sm_component)) { + return NULL; + } + } + +#if OMPI_ENABLE_PROGRESS_THREADS == 1 + /* create a named pipe to receive events */ + sprintf(mca_btl_sm_component.sm_fifo_path, + "%s"OPAL_PATH_SEP"sm_fifo.%lu", + orte_process_info.job_session_dir, + (unsigned long)ORTE_PROC_MY_NAME->vpid); + if (mkfifo(mca_btl_sm_component.sm_fifo_path, 0660) < 0) { + opal_output(0, "mca_btl_sm_component_init: " + "mkfifo failed with errno=%d\n",errno); + return NULL; + } + mca_btl_sm_component.sm_fifo_fd = open(mca_btl_sm_component.sm_fifo_path, + O_RDWR); if(mca_btl_sm_component.sm_fifo_fd < 0) { - opal_output(0, "mca_btl_sm_component_init: open(%s) failed with errno=%d\n", + opal_output(0, "mca_btl_sm_component_init: " + "open(%s) failed with errno=%d\n", mca_btl_sm_component.sm_fifo_path, errno); return NULL; } OBJ_CONSTRUCT(&mca_btl_sm_component.sm_fifo_thread, opal_thread_t); - mca_btl_sm_component.sm_fifo_thread.t_run = (opal_thread_fn_t) mca_btl_sm_component_event_thread; + mca_btl_sm_component.sm_fifo_thread.t_run = + (opal_thread_fn_t)mca_btl_sm_component_event_thread; opal_thread_start(&mca_btl_sm_component.sm_fifo_thread); #endif - mca_btl_sm_component.sm_btls = (mca_btl_sm_t **) malloc( mca_btl_sm_component.sm_max_btls * sizeof (mca_btl_sm_t *)); + mca_btl_sm_component.sm_btls = + (mca_btl_sm_t **)malloc(mca_btl_sm_component.sm_max_btls * + sizeof(mca_btl_sm_t *)); if (NULL == mca_btl_sm_component.sm_btls) { return NULL; } diff --git a/ompi/mca/btl/sm/help-mpi-btl-sm.txt b/ompi/mca/btl/sm/help-mpi-btl-sm.txt index b6905097d6..2e82a4aac0 100644 --- a/ompi/mca/btl/sm/help-mpi-btl-sm.txt +++ b/ompi/mca/btl/sm/help-mpi-btl-sm.txt @@ -4,6 +4,8 @@ # of Tennessee Research Foundation. All rights # reserved. # Copyright (c) 2006-2010 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2012 Los Alamos National Security, LLC. +# All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -12,6 +14,9 @@ # # This is the US/English help file for Open MPI's shared memory support. # +[no locality] +WARNING: Missing locality info XXX FIXME SKG +# [knem requested but not supported] WARNING: Linux kernel knem support was requested for the shared memory (sm) BTL, but it is not supported. Deactivating the shared memory diff --git a/ompi/mca/common/sm/common_sm.c b/ompi/mca/common/sm/common_sm.c index 889d9c3faa..f7c4d8192b 100644 --- a/ompi/mca/common/sm/common_sm.c +++ b/ompi/mca/common/sm/common_sm.c @@ -42,6 +42,7 @@ #include "opal/align.h" #include "opal/util/argv.h" +#include "opal/mca/shmem/shmem.h" #if OPAL_ENABLE_FT_CR == 1 #include "opal/runtime/opal_cr.h" #endif @@ -133,7 +134,7 @@ attach_and_init(opal_shmem_ds_t *shmem_bufp, map->module_data_addr = addr; map->module_seg_addr = (unsigned char *)seg; - + /* note that size is only used during the first call */ if (first_call) { /* initialize some segment information */ @@ -157,7 +158,7 @@ attach_and_init(opal_shmem_ds_t *shmem_bufp, } /* ////////////////////////////////////////////////////////////////////////// */ -/* api implementation */ +/* api implementation */ /* ////////////////////////////////////////////////////////////////////////// */ /* ////////////////////////////////////////////////////////////////////////// */ @@ -170,7 +171,7 @@ mca_common_sm_module_create(size_t size, mca_common_sm_module_t *map = NULL; opal_shmem_ds_t *seg_meta = NULL; - if (NULL == (seg_meta = (opal_shmem_ds_t *) malloc(sizeof(*seg_meta)))) { + if (NULL == (seg_meta = (opal_shmem_ds_t *)malloc(sizeof(*seg_meta)))) { /* out of resources */ return NULL; } @@ -197,33 +198,40 @@ mca_common_sm_module_attach(opal_shmem_ds_t *seg_meta, size_t size_ctl_structure, size_t data_seg_alignment) { - mca_common_sm_module_t *map = NULL; - /* notice that size is 0 here. it really doesn't matter because size WILL * NOT be used because this is an attach (first_call is false). */ - map = attach_and_init(seg_meta, 0, size_ctl_structure, - data_seg_alignment, false); - - return map; + return attach_and_init(seg_meta, 0, size_ctl_structure, + data_seg_alignment, false); } /* ////////////////////////////////////////////////////////////////////////// */ -mca_common_sm_module_t * -mca_common_sm_init(ompi_proc_t **procs, - size_t num_procs, - size_t size, - char *file_name, - size_t size_ctl_structure, - size_t data_seg_alignment) +int +mca_common_sm_module_unlink(mca_common_sm_module_t *modp) { - /* indicates whether or not i'm the lowest named process */ - bool lowest_local_proc = false; - mca_common_sm_module_t *map = NULL; - ompi_proc_t *temp_proc = NULL; - bool found_lowest = false; - size_t num_local_procs = 0, p = 0; - opal_shmem_ds_t *seg_meta = NULL; + if (NULL == modp) { + return OMPI_ERROR; + } + if (OPAL_SUCCESS != opal_shmem_unlink(&modp->shmem_ds)) { + return OMPI_ERROR; + } + return OMPI_SUCCESS; +} +/* ////////////////////////////////////////////////////////////////////////// */ +int +mca_common_sm_local_proc_reorder(ompi_proc_t **procs, + size_t num_procs, + size_t *out_num_local_procs) + +{ + size_t num_local_procs = 0; + bool found_lowest = false; + ompi_proc_t *temp_proc = NULL; + size_t p; + + if (NULL == out_num_local_procs || NULL == procs) { + return OMPI_ERR_BAD_PARAM; + } /* o reorder procs array to have all the local procs at the beginning. * o look for the local proc with the lowest name. * o determine the number of local procs. @@ -240,8 +248,7 @@ mca_common_sm_init(ompi_proc_t **procs, /* save this proc */ procs[num_local_procs] = procs[p]; /* if we have a new lowest, swap it with position 0 - * so that procs[0] is always the lowest named proc - */ + * so that procs[0] is always the lowest named proc */ if (OPAL_VALUE2_GREATER == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &(procs[p]->proc_name), @@ -257,6 +264,31 @@ mca_common_sm_init(ompi_proc_t **procs, ++num_local_procs; } } + *out_num_local_procs = num_local_procs; + + return OMPI_SUCCESS; +} + +/* ////////////////////////////////////////////////////////////////////////// */ +mca_common_sm_module_t * +mca_common_sm_init(ompi_proc_t **procs, + size_t num_procs, + size_t size, + char *file_name, + size_t size_ctl_structure, + size_t data_seg_alignment) +{ + /* indicates whether or not i'm the lowest named process */ + bool lowest_local_proc = false; + mca_common_sm_module_t *map = NULL; + size_t num_local_procs = 0; + opal_shmem_ds_t *seg_meta = NULL; + + if (OMPI_SUCCESS != mca_common_sm_local_proc_reorder(procs, + num_procs, + &num_local_procs)) { + return NULL; + } /* if there is less than 2 local processes, there's nothing to do. */ if (num_local_procs < 2) { @@ -270,9 +302,9 @@ mca_common_sm_init(ompi_proc_t **procs, /* determine whether or not i am the lowest local process */ lowest_local_proc = - (0 == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, - ORTE_PROC_MY_NAME, - &(procs[0]->proc_name))); + (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, + ORTE_PROC_MY_NAME, + &(procs[0]->proc_name))); /* figure out if i am the lowest rank in the group. * if so, i will create the shared memory backing store @@ -434,4 +466,3 @@ mca_common_sm_fini(mca_common_sm_module_t *mca_common_sm_module) } return rc; } - diff --git a/ompi/mca/common/sm/common_sm.h b/ompi/mca/common/sm/common_sm.h index b8fd007e1e..05e2ac9b54 100644 --- a/ompi/mca/common/sm/common_sm.h +++ b/ompi/mca/common/sm/common_sm.h @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2011 Los Alamos National Security, LLC. + * Copyright (c) 2010-2012 Los Alamos National Security, LLC. * All rights reserved. * $COPYRIGHT$ * @@ -72,6 +72,19 @@ typedef struct mca_common_sm_module_t { OBJ_CLASS_DECLARATION(mca_common_sm_module_t); +/** + * This routine reorders procs array to have all the local procs at the + * beginning and returns the number of local procs through out_num_local_procs. + * The proc with the lowest name is at the beginning of the reordered procs + * array. + * + * @returnvalue OMPI_SUCCESS on success, something else, otherwise. + */ +OMPI_DECLSPEC extern int +mca_common_sm_local_proc_reorder(ompi_proc_t **procs, + size_t num_procs, + size_t *out_num_local_procs); + /** * This routine is used to create a shared memory segment (whether * it's an mmaped file or a SYSV IPC segment). It is assumed that @@ -80,7 +93,7 @@ OBJ_CLASS_DECLARATION(mca_common_sm_module_t); * @returnvalue pointer to control structure at head of shared memory segment. * Returns NULL if an error occurred. */ -mca_common_sm_module_t * +OMPI_DECLSPEC extern mca_common_sm_module_t * mca_common_sm_module_create(size_t size, char *file_name, size_t size_ctl_structure, @@ -96,11 +109,22 @@ mca_common_sm_module_create(size_t size, * @returnvalue pointer to control structure at head of shared memory segment. * Returns NULL if an error occurred. */ -mca_common_sm_module_t * +OMPI_DECLSPEC extern mca_common_sm_module_t * mca_common_sm_module_attach(opal_shmem_ds_t *seg_meta, size_t size_ctl_structure, size_t data_seg_alignment); +/** + * A thin wrapper around opal_shmem_unlink. + * + * @ modp points to an initialized mca_common_sm_module_t. + * + * @returnvalue OMPI_SUCCESS if the operation completed successfully, + * OMPI_ERROR otherwise. + */ +OMPI_DECLSPEC extern int +mca_common_sm_module_unlink(mca_common_sm_module_t *modp); + /** * This routine is used to set up a shared memory segment (whether * it's an mmaped file or a SYSV IPC segment). It is assumed that @@ -164,7 +188,7 @@ mca_common_sm_init_group(ompi_group_t *group, */ OMPI_DECLSPEC extern void * mca_common_sm_seg_alloc(struct mca_mpool_base_module_t *mpool, - size_t* size, + size_t *size, mca_mpool_base_registration_t **registration); /** @@ -189,4 +213,3 @@ OMPI_DECLSPEC extern mca_common_sm_module_t *mca_common_sm_module; END_C_DECLS #endif /* _COMMON_SM_H_ */ - diff --git a/ompi/mca/mpool/sm/mpool_sm.h b/ompi/mca/mpool/sm/mpool_sm.h index b46bc044d5..9666b3b63f 100644 --- a/ompi/mca/mpool/sm/mpool_sm.h +++ b/ompi/mca/mpool/sm/mpool_sm.h @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010 Los Alamos National Security, LLC. + * Copyright (c) 2010-2012 Los Alamos National Security, LLC. * All rights reserved. * $COPYRIGHT$ * @@ -28,6 +28,7 @@ #include "ompi_config.h" #include "opal/mca/event/event.h" +#include "opal/mca/shmem/shmem.h" #include "ompi/mca/common/sm/common_sm.h" #include "ompi/mca/mpool/mpool.h" @@ -36,17 +37,19 @@ BEGIN_C_DECLS struct mca_mpool_sm_component_t { - mca_mpool_base_component_t super; - /* mca_allocator_base_module_t* sm_allocator; */ - char* sm_allocator_name; - int verbose; - /* struct mca_mpool_sm_mmap_t *sm_mmap; */ + mca_mpool_base_component_t super; + /* mca_allocator_base_module_t* sm_allocator; */ + char *sm_allocator_name; + int verbose; + /* struct mca_mpool_sm_mmap_t *sm_mmap; */ }; typedef struct mca_mpool_sm_component_t mca_mpool_sm_component_t; typedef struct mca_mpool_base_resources_t { size_t size; int32_t mem_node; + /* backing store metadata */ + opal_shmem_ds_t bs_meta_buf; } mca_mpool_base_resources_t; OMPI_MODULE_DECLSPEC extern mca_mpool_sm_component_t mca_mpool_sm_component; @@ -54,7 +57,7 @@ OMPI_MODULE_DECLSPEC extern mca_mpool_sm_component_t mca_mpool_sm_component; typedef struct mca_mpool_sm_module_t { mca_mpool_base_module_t super; long sm_size; - mca_allocator_base_module_t * sm_allocator; + mca_allocator_base_module_t *sm_allocator; struct mca_mpool_sm_mmap_t *sm_mmap; mca_common_sm_module_t *sm_common_module; int32_t mem_node; diff --git a/ompi/mca/mpool/sm/mpool_sm_component.c b/ompi/mca/mpool/sm/mpool_sm_component.c index bccaf78e38..0bd0f6551b 100644 --- a/ompi/mca/mpool/sm/mpool_sm_component.c +++ b/ompi/mca/mpool/sm/mpool_sm_component.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010 Los Alamos National Security, LLC. + * Copyright (c) 2010-2012 Los Alamos National Security, LLC. * All rights reserved. * $COPYRIGHT$ * @@ -45,10 +45,14 @@ /* * Local functions */ -static int mca_mpool_sm_open(void); -static int mca_mpool_sm_close( void ); -static mca_mpool_base_module_t* mca_mpool_sm_init( - struct mca_mpool_base_resources_t* resources); +static int +mca_mpool_sm_open(void); + +static int +mca_mpool_sm_close(void); + +static mca_mpool_base_module_t * +mca_mpool_sm_init(struct mca_mpool_base_resources_t* resources); mca_mpool_sm_component_t mca_mpool_sm_component = { { @@ -90,8 +94,8 @@ static int mca_mpool_sm_open(void) /* register SM component parameters */ mca_base_param_reg_string(&mca_mpool_sm_component.super.mpool_version, "allocator", - "Name of allocator component to use with sm mpool", - false, false, + "Name of allocator component " + "to use with sm mpool", false, false, "bucket", &mca_mpool_sm_component.sm_allocator_name); @@ -100,18 +104,18 @@ static int mca_mpool_sm_open(void) * to be set up to 2GB-1 for 32 bit and much greater for 64 bit. */ asprintf(&size_str, "%ld", default_min); mca_base_param_reg_string(&mca_mpool_sm_component.super.mpool_version, - "min_size", - "Minimum size of the sm mpool shared memory file", - false, false, size_str, &min_size_param); + "min_size", + "Minimum size of the sm mpool shared memory file", + false, false, size_str, &min_size_param); free(size_str); mca_base_param_reg_int(&mca_mpool_sm_component.super.mpool_version, - "verbose", - "Enable verbose output for mpool sm component", - false, false, 0, &value); + "verbose", + "Enable verbose output for mpool sm component", + false, false, 0, &value); if (value != 0) { - mca_mpool_sm_component.verbose = opal_output_open(NULL); + mca_mpool_sm_component.verbose = opal_output_open(NULL); } else { - mca_mpool_sm_component.verbose = -1; + mca_mpool_sm_component.verbose = -1; } return OMPI_SUCCESS; @@ -128,41 +132,46 @@ static int mca_mpool_sm_close( void ) return OMPI_SUCCESS; } -static mca_mpool_base_module_t* mca_mpool_sm_init( - struct mca_mpool_base_resources_t* resources) +static mca_mpool_base_module_t * +mca_mpool_sm_init(struct mca_mpool_base_resources_t *resources) { char *file_name; int len; - mca_mpool_sm_module_t* mpool_module; + mca_mpool_sm_module_t *mpool_module; mca_allocator_base_component_t* allocator_component; long min_size; ompi_proc_t **procs; size_t num_all_procs, i, num_local_procs = 0; /* README: this needs to change if procs in different jobs (even - spawned ones) are to talk using shared memory */ - procs = ompi_proc_world(&num_all_procs); + * spawned ones) are to talk using shared memory */ + if (NULL == (procs = ompi_proc_world(&num_all_procs))) { + /* out of resources, so just bail */ + return NULL; + } for (i = 0 ; i < num_all_procs ; ++i) { if (OPAL_PROC_ON_LOCAL_NODE(procs[i]->proc_flags)) { num_local_procs++; } } - /* parse the min size and validate it */ - /* if other parameters are added, absolutely necessary to reset errno each time */ + /* if other parameters are added, absolutely + * necessary to reset errno each time */ errno = 0; min_size = strtol(min_size_param, (char **)NULL, 10); if (errno == ERANGE) { - opal_output(0, "mca_mpool_sm_init: min_size overflows! set to default (%ld)", default_min); + opal_output(0, "mca_mpool_sm_init: min_size overflows! " + "set to default (%ld)", default_min); min_size = default_min; } else if (errno == EINVAL) { - opal_output(0, "mca_mpool_sm_init: invalid min_size entered. set it to (%ld)", default_min); + opal_output(0, "mca_mpool_sm_init: invalid min_size entered. " + "set it to (%ld)", default_min); min_size = default_min; } /* Make a new mpool module */ mpool_module = - (mca_mpool_sm_module_t*)malloc(sizeof(mca_mpool_sm_module_t)); + (mca_mpool_sm_module_t *)malloc(sizeof(mca_mpool_sm_module_t)); mca_mpool_sm_module_init(mpool_module); /* set sm_size */ @@ -180,16 +189,22 @@ static mca_mpool_base_module_t* mca_mpool_sm_init( mca_mpool_sm_component.sm_allocator_name); /* if specified allocator cannot be loaded - look for an alternative */ - if(NULL == allocator_component) { - if(opal_list_get_size(&mca_allocator_base_components) == 0) { - mca_base_component_list_item_t* item = (mca_base_component_list_item_t*) + if (NULL == allocator_component) { + if (opal_list_get_size(&mca_allocator_base_components) == 0) { + mca_base_component_list_item_t *item = + (mca_base_component_list_item_t *) opal_list_get_first(&mca_allocator_base_components); - allocator_component = (mca_allocator_base_component_t*)item->cli_component; - opal_output(0, "mca_mpool_sm_init: unable to locate allocator: %s - using %s\n", - mca_mpool_sm_component.sm_allocator_name, allocator_component->allocator_version.mca_component_name); + allocator_component = + (mca_allocator_base_component_t *)item->cli_component; + opal_output( + 0, "mca_mpool_sm_init: " + "unable to locate allocator: %s - using %s\n", + mca_mpool_sm_component.sm_allocator_name, + allocator_component->allocator_version.mca_component_name); } else { - opal_output(0, "mca_mpool_sm_init: unable to locate allocator: %s\n", - mca_mpool_sm_component.sm_allocator_name); + opal_output(0, "mca_mpool_sm_init: " + "unable to locate allocator: %s\n", + mca_mpool_sm_component.sm_allocator_name); free(procs); return NULL; } @@ -198,10 +213,10 @@ static mca_mpool_base_module_t* mca_mpool_sm_init( mpool_module->mem_node = resources->mem_node; /* create initial shared memory mapping */ - len = asprintf( &file_name, "%s"OPAL_PATH_SEP"shared_mem_pool.%s", - orte_process_info.job_session_dir, - orte_process_info.nodename ); - if ( 0 > len ) { + len = asprintf(&file_name, "%s"OPAL_PATH_SEP"shared_mem_pool.%s", + orte_process_info.job_session_dir, + orte_process_info.nodename); + if (0 > len) { free(mpool_module); free(procs); return NULL; @@ -216,8 +231,8 @@ static mca_mpool_base_module_t* mca_mpool_sm_init( mpool_module->sm_size, file_name, sizeof(mca_common_sm_module_t), 8))) { - opal_output(mca_mpool_sm_component.verbose, - "mca_mpool_sm_init: unable to create shared memory mapping (%s)", file_name); + opal_output(mca_mpool_sm_component.verbose, "mca_mpool_sm_init: " + "unable to create shared memory mapping (%s)", file_name); free(file_name); free(mpool_module); free(procs); @@ -231,7 +246,7 @@ static mca_mpool_base_module_t* mca_mpool_sm_init( allocator_component->allocator_init(true, mca_common_sm_seg_alloc, NULL, &(mpool_module->super)); - if(NULL == mpool_module->sm_allocator) { + if (NULL == mpool_module->sm_allocator) { opal_output(0, "mca_mpool_sm_init: unable to initialize allocator"); free(mpool_module); return NULL;