1
1

sm BTL initialization via modex, as discussed at last year's meeting.

This commit was SVN r27739.
Этот коммит содержится в:
Samuel Gutierrez 2013-01-03 21:52:20 +00:00
родитель 81a8e21939
Коммит a159bfaf25
13 изменённых файлов: 940 добавлений и 256 удалений

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2006-2007 Voltaire. All rights reserved.
* Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2012 Los Alamos National Security, LLC.
* Copyright (c) 2010-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2010-2012 IBM Corporation. All rights reserved.
* Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved.
@ -43,9 +43,12 @@
#include "opal/util/output.h"
#include "opal/util/printf.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/mca/shmem/base/base.h"
#include "opal/mca/shmem/shmem.h"
#include "orte/util/proc_info.h"
#include "opal/datatype/opal_convertor.h"
#include "ompi/class/ompi_free_list.h"
#include "ompi/runtime/ompi_module_exchange.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/mca/mpool/base/base.h"
#include "ompi/mca/mpool/sm/mpool_sm.h"
@ -111,7 +114,6 @@ mca_btl_sm_t mca_btl_sm = {
*/
#define OFFSET2ADDR(OFFSET, BASE) ((ptrdiff_t)(OFFSET) + (char*)(BASE))
static void *mpool_calloc(size_t nmemb, size_t size)
{
void *buf;
@ -127,17 +129,163 @@ static void *mpool_calloc(size_t nmemb, size_t size)
return buf;
}
static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n)
/*
 * Returns a pointer to node rank zero, i.e. the first entry of proc_world
 * after mca_common_sm_local_proc_reorder() has reordered the list in place.
 * Returns NULL on error (NULL or empty list, or a reorder failure).
 *
 * proc_world      - array of proc pointers; reordered in place.
 * proc_world_size - number of entries in proc_world.
 */
static ompi_proc_t *
get_node_rank_zero_proc_ptr(ompi_proc_t **proc_world,
                            size_t proc_world_size)
{
    size_t num_local_procs = 0;

    /* an empty list has no first element that could be handed back */
    if (NULL == proc_world || 0 == proc_world_size) {
        return NULL;
    }
    /* sort the procs list and get a pointer to the lowest node rank */
    if (OMPI_SUCCESS != mca_common_sm_local_proc_reorder(proc_world,
                                                         proc_world_size,
                                                         &num_local_procs)) {
        opal_output(0, "mca_common_sm_local_proc_reorder failure! "
                    "Cannot continue.\n");
        return NULL;
    }
    return proc_world[0];
}
/*
 * Receives every member of the sm modex from node rank zero and assembles
 * them into a freshly allocated mca_btl_sm_modex_t.
 *
 * Members 0 and 1 (the two opal_shmem_ds_t structures) arrive in two pieces:
 *   "<component>-<mid>-0": the bytes that precede seg_name
 *   "<component>-<mid>-1": the segment path as an OPAL_STRING
 * Member 2 arrives as a single piece under "<component>-<mid>".
 *
 * comp_ptr      - component whose version string forms the key prefix.
 * sm_modex_bufp - out: on success points at the assembled modex, which the
 *                 caller must free; set to NULL on failure.
 *
 * Returns OMPI_SUCCESS, or an OMPI error code on failure.
 */
static int
do_segmented_modex_recv(mca_btl_sm_component_t *comp_ptr,
                        mca_btl_sm_modex_t **sm_modex_bufp)
{
    int member_id = 0, rc = OMPI_ERROR;
    size_t segment_size = 0, member_offset = 0;
    size_t key_len = 0, proc_world_size = 0;
    const size_t path_offset = offsetof(opal_shmem_ds_t, seg_name);
    unsigned char *modex_bufp = NULL;
    char *key = NULL, *modex_comp_name = NULL;
    void *tmp_bufp = NULL;
    opal_shmem_ds_t *tmp_ds = NULL;
    ompi_proc_t **proc_world = NULL, *proc_node_rank_zero = NULL;

    /* make the cleanup path independent of whatever the caller passed in */
    *sm_modex_bufp = NULL;

    if (NULL == (modex_bufp = calloc(1, sizeof(mca_btl_sm_modex_t)))) {
        rc = OMPI_ERR_OUT_OF_RESOURCE;
        goto out;
    }
    /* stash the base of the modex buffer, because modex_bufp gets modified */
    *sm_modex_bufp = (mca_btl_sm_modex_t *)modex_bufp;

    if (NULL == (proc_world = ompi_proc_world(&proc_world_size))) {
        opal_output(0, "ompi_proc_world failure! Cannot continue.\n");
        rc = OMPI_ERROR;
        goto out;
    }
    if (NULL == (proc_node_rank_zero =
                 get_node_rank_zero_proc_ptr(proc_world, proc_world_size))) {
        opal_output(0, "get_node_rank_zero_proc_ptr failure! "
                    "Cannot continue.\n");
        rc = OMPI_ERROR;
        goto out;
    }
    if (NULL == (modex_comp_name =
                 mca_base_component_to_string(&comp_ptr->super.btl_version))) {
        rc = OMPI_ERR_OUT_OF_RESOURCE;
        goto out;
    }
    /* SM_MODEX_STR_PAD to accommodate the member id and key index */
    key_len = strlen(modex_comp_name) + SM_MODEX_STR_PAD;
    if (NULL == (key = calloc(key_len, sizeof(*key)))) {
        rc = OMPI_ERR_OUT_OF_RESOURCE;
        goto out;
    }

    /* iterate over all modex members and store their respective pieces */
    for (member_id = 0; member_id < SM_MODEX_NUM_MEMBERS; ++member_id) {
        if (OMPI_SUCCESS != (rc =
            mca_btl_sm_get_modex_member_off_n_size(*sm_modex_bufp, member_id,
                                                   &member_offset, NULL))) {
            goto out;
        }
        modex_bufp = ((unsigned char *)*sm_modex_bufp) + member_offset;

        if (member_id < 2) { /* for mids 0 and 1 */
            size_t path_len = 0;

            if (NULL == (tmp_ds = calloc(1, sizeof(*tmp_ds)))) {
                rc = OMPI_ERR_OUT_OF_RESOURCE;
                goto out;
            }

            /* first piece: everything that precedes the path */
            (void)snprintf(key, key_len, "%s-%d-%d",
                           modex_comp_name, member_id, 0);
            segment_size = 0;
            rc = ompi_modex_recv_string((const char *)key, proc_node_rank_zero,
                                        &tmp_bufp, &segment_size);
            if (OMPI_SUCCESS != rc) {
                /* the out parameter cannot be trusted after a failure */
                tmp_bufp = NULL;
                goto out;
            }
            /* never copy more than the staging structure can hold */
            if (segment_size > sizeof(*tmp_ds)) {
                rc = OMPI_ERR_VALUE_OUT_OF_BOUNDS;
                goto out;
            }
            if (NULL != tmp_bufp) {
                (void)memmove(tmp_ds, tmp_bufp, segment_size);
            }
            free(tmp_bufp);
            tmp_bufp = NULL;

            /* now copy the path stuff */
            (void)snprintf(key, key_len, "%s-%d-%d",
                           modex_comp_name, member_id, 1);
            rc = ompi_modex_recv_key_value((const char *)key,
                                           proc_node_rank_zero,
                                           &tmp_bufp, OPAL_STRING);
            if (OMPI_SUCCESS != rc) {
                tmp_bufp = NULL;
                goto out;
            }
            if (NULL == tmp_bufp) {
                rc = OMPI_ERROR;
                goto out;
            }
            path_len = strlen((char *)tmp_bufp) + 1;
            /* the path (with its terminator) must fit behind path_offset */
            if (path_len > sizeof(*tmp_ds) - path_offset) {
                rc = OMPI_ERR_VALUE_OUT_OF_BOUNDS;
                goto out;
            }
            (void)memmove((unsigned char *)tmp_ds + path_offset,
                          tmp_bufp, path_len);
            free(tmp_bufp);
            tmp_bufp = NULL;

            (void)memmove(modex_bufp, tmp_ds, sizeof(*tmp_ds));
            free(tmp_ds);
            tmp_ds = NULL;
        }
        else { /* for mid 2 */
            (void)snprintf(key, key_len, "%s-%d", modex_comp_name, member_id);
            segment_size = 0;
            rc = ompi_modex_recv_string((const char *)key, proc_node_rank_zero,
                                        &tmp_bufp, &segment_size);
            if (OMPI_SUCCESS != rc) {
                /* rc is set */
                tmp_bufp = NULL;
                goto out;
            }
            /* the payload must fit into the mpool_res_size member */
            if (segment_size > sizeof((*sm_modex_bufp)->mpool_res_size)) {
                rc = OMPI_ERR_VALUE_OUT_OF_BOUNDS;
                goto out;
            }
            if (NULL != tmp_bufp) {
                (void)memmove(modex_bufp, tmp_bufp, segment_size);
            }
            free(tmp_bufp);
            tmp_bufp = NULL;
        }
    }

out:
    /* free(NULL) is a no-op, so no guards are needed here */
    free(tmp_bufp);
    free(tmp_ds);
    free(modex_comp_name);
    free(key);
    free(proc_world);
    if (OMPI_SUCCESS != rc && NULL != *sm_modex_bufp) {
        free(*sm_modex_bufp);
        *sm_modex_bufp = NULL;
    }
    return rc;
}
/*
 * Modex receive: thin wrapper around do_segmented_modex_recv() that logs a
 * message when the receive fails.  Caller is responsible for freeing the
 * resources returned through out_modex.
 *
 * Returns whatever do_segmented_modex_recv() returned.
 */
static inline int
recv_modex(mca_btl_sm_component_t *comp_ptr,
           mca_btl_sm_modex_t **out_modex)
{
    const int status = do_segmented_modex_recv(comp_ptr, out_modex);

    if (OMPI_SUCCESS == status) {
        return status;
    }
    opal_output(0, "recv_modex: do_segmented_modex_recv failure!\n");
    return status;
}
static int
sm_btl_first_time_init(mca_btl_sm_t *sm_btl,
int32_t my_smp_rank,
int n)
{
size_t length, length_payload;
sm_fifo_t *my_fifos;
int my_mem_node, num_mem_nodes, i;
ompi_proc_t **procs;
size_t num_procs;
mca_mpool_base_resources_t res;
int my_mem_node, num_mem_nodes, i, rc;
mca_mpool_base_resources_t *res = NULL;
mca_btl_sm_component_t* m = &mca_btl_sm_component;
mca_btl_sm_modex_t *modex = NULL;
/* Assume we don't have hwloc support and fill in dummy info */
mca_btl_sm_component.mem_node = my_mem_node = 0;
@ -190,50 +338,43 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n)
}
#endif
/* lookup shared memory pool */
mca_btl_sm_component.sm_mpools = (mca_mpool_base_module_t **) calloc(num_mem_nodes,
sizeof(mca_mpool_base_module_t*));
/* Disable memory binding, because each MPI process will claim
pages in the mpool for their local NUMA node */
res.mem_node = -1;
/* determine how much memory to create */
/*
* This heuristic formula mostly says that we request memory for:
* - nfifos FIFOs, each comprising:
* . a sm_fifo_t structure
* . many pointers (fifo_size of them per FIFO)
* - eager fragments (2*n of them, allocated in sm_free_list_inc chunks)
* - max fragments (sm_free_list_num of them)
*
* On top of all that, we sprinkle in some number of
* "opal_cache_line_size" additions to account for some
* padding and edge effects that may lie in the allocator.
*/
res.size =
FIFO_MAP_NUM(n) * ( sizeof(sm_fifo_t) + sizeof(void *) * m->fifo_size + 4 * opal_cache_line_size )
+ ( 2 * n + m->sm_free_list_inc ) * ( m->eager_limit + 2 * opal_cache_line_size )
+ m->sm_free_list_num * ( m->max_frag_size + 2 * opal_cache_line_size );
/* before we multiply by n, make sure the result won't overflow */
/* Stick that little pad in, particularly since we'll eventually
* need a little extra space. E.g., in mca_mpool_sm_init() in
* mpool_sm_component.c when sizeof(mca_common_sm_module_t) is
* added.
*/
if ( ((double) res.size) * n > LONG_MAX - 4096 ) {
if (NULL == (res = calloc(1, sizeof(*res)))) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
res.size *= n;
/* now, create it */
/* everyone receive modex information. all but node rank zero attach to the
* segments stored within the modex. remember: node rank zero is already
* attached to sm_seg. */
if (OMPI_SUCCESS != (rc = recv_modex(m, &modex))) {
free(res);
return rc;
}
/* lookup shared memory pool */
mca_btl_sm_component.sm_mpools =
(mca_mpool_base_module_t **)calloc(num_mem_nodes,
sizeof(mca_mpool_base_module_t *));
/* Disable memory binding, because each MPI process will claim pages in the
* mpool for their local NUMA node */
res->mem_node = -1;
res->size = modex->mpool_res_size;
/* copy mpool's modex info into its base resources */
if (OPAL_SUCCESS !=
opal_shmem_ds_copy(&(modex->sm_mpool_meta_buf),
&(res->bs_meta_buf))) {
free(res);
free(modex);
return OMPI_ERROR;
}
/* now that res is fully populated, create the thing */
mca_btl_sm_component.sm_mpools[0] =
mca_mpool_base_module_create(mca_btl_sm_component.sm_mpool_name,
sm_btl, &res);
sm_btl, res);
/* Sanity check to ensure that we found it */
if (NULL == mca_btl_sm_component.sm_mpools[0]) {
return OMPI_ERR_OUT_OF_RESOURCE;
free(res);
free(modex);
return OMPI_ERR_OUT_OF_RESOURCE;
}
mca_btl_sm_component.sm_mpool = mca_btl_sm_component.sm_mpools[0];
@ -245,37 +386,27 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n)
mca_btl_sm_component.sm_peers = (struct mca_btl_base_endpoint_t**)
calloc(n, sizeof(struct mca_btl_base_endpoint_t*));
if (NULL == mca_btl_sm_component.sm_peers) {
free(res);
free(modex);
return OMPI_ERR_OUT_OF_RESOURCE;
}
/* Allocate Shared Memory BTL process coordination
* data structure. This will reside in shared memory */
/* set file name */
if (asprintf(&sm_ctl_file, "%s"OPAL_PATH_SEP"shared_mem_btl_module.%s",
orte_process_info.job_session_dir,
orte_process_info.nodename) < 0) {
return OMPI_ERR_OUT_OF_RESOURCE;
if (0 != my_smp_rank) {
if (NULL == (mca_btl_sm_component.sm_seg =
mca_common_sm_module_attach(&modex->sm_meta_buf,
sizeof(mca_common_sm_seg_header_t),
opal_cache_line_size))) {
/* don't have to detach here, because module_attach cleans up after
* itself on failure. */
opal_output(0, "sm_btl_first_time_init: "
"mca_common_sm_module_attach failure!\n");
free(modex);
free(res);
return OMPI_ERROR;
}
}
/* Pass in a data segment alignment of 0 to get no data
segment (only the shared control structure) */
size = sizeof(mca_common_sm_seg_header_t) +
n * (sizeof(sm_fifo_t*) + sizeof(char *) + sizeof(uint16_t)) + opal_cache_line_size;
procs = ompi_proc_world(&num_procs);
if (!(mca_btl_sm_component.sm_seg =
mca_common_sm_init(procs, num_procs, size, sm_ctl_file,
sizeof(mca_common_sm_seg_header_t),
opal_cache_line_size))) {
opal_output(0, "mca_btl_sm_add_procs: unable to create shared memory "
"BTL coordinating strucure :: size %lu \n",
(unsigned long)size);
free(procs);
free(sm_ctl_file);
return OMPI_ERROR;
}
free(procs);
free(sm_ctl_file);
/* it is now safe to free the modex and the mpool resources */
free(modex);
free(res);
/* check to make sure number of local procs is within the
* specified limits */
@ -374,6 +505,7 @@ static struct mca_btl_base_endpoint_t *
create_sm_endpoint(int local_proc, struct ompi_proc_t *proc)
{
struct mca_btl_base_endpoint_t *ep;
#if OMPI_ENABLE_PROGRESS_THREADS == 1
char path[PATH_MAX];
#endif
@ -401,22 +533,6 @@ create_sm_endpoint(int local_proc, struct ompi_proc_t *proc)
return ep;
}
/*
 * Fills in mca_btl_sm_component.sm_max_procs when it is still negative:
 *   - sm_extra_procs >= 0: n + sm_extra_procs
 *   - sm_extra_procs <  0: 2 * n
 * A non-negative sm_max_procs is left untouched.
 */
static void calc_sm_max_procs(int n)
{
    /* a value is already in place; nothing to derive */
    if (mca_btl_sm_component.sm_max_procs >= 0) {
        return;
    }
    if (mca_btl_sm_component.sm_extra_procs < 0) {
        /* no usable extra-procs value: reserve room for twice as many */
        mca_btl_sm_component.sm_max_procs = 2 * n;
    } else {
        /* reserve room for the requested number of additional procs */
        mca_btl_sm_component.sm_max_procs =
            n + mca_btl_sm_component.sm_extra_procs;
    }
}
int mca_btl_sm_add_procs(
struct mca_btl_base_module_t* btl,
size_t nprocs,
@ -430,6 +546,9 @@ int mca_btl_sm_add_procs(
mca_btl_sm_t *sm_btl;
bool have_connected_peer = false;
char **bases;
/* for easy access to the mpool_sm_module */
mca_mpool_sm_module_t *sm_mpool_modp = NULL;
/* initialization */
sm_btl = (mca_btl_sm_t *)btl;
@ -442,7 +561,7 @@ int mca_btl_sm_add_procs(
* and identify procs that are on this host. Add procs on this
* host to shared memory reachability list. Also, get number
* of local procs in the procs list. */
for(proc = 0; proc < (int32_t)nprocs; proc++) {
for (proc = 0; proc < (int32_t)nprocs; proc++) {
/* check to see if this proc can be reached via shmem (i.e.,
if they're on my local host and in my job) */
if (procs[proc]->proc_name.jobid != my_proc->proc_name.jobid ||
@ -477,18 +596,18 @@ int mca_btl_sm_add_procs(
goto CLEANUP;
/* make sure that my_smp_rank has been defined */
if(-1 == my_smp_rank) {
if (-1 == my_smp_rank) {
return_code = OMPI_ERROR;
goto CLEANUP;
}
calc_sm_max_procs(n_local_procs);
if (!sm_btl->btl_inited) {
return_code =
sm_btl_first_time_init(sm_btl, mca_btl_sm_component.sm_max_procs);
if(return_code != OMPI_SUCCESS)
sm_btl_first_time_init(sm_btl, my_smp_rank,
mca_btl_sm_component.sm_max_procs);
if (return_code != OMPI_SUCCESS) {
goto CLEANUP;
}
}
/* set local proc's smp rank in the peers structure for
@ -501,6 +620,7 @@ int mca_btl_sm_add_procs(
}
bases = mca_btl_sm_component.shm_bases;
sm_mpool_modp = (mca_mpool_sm_module_t *)mca_btl_sm_component.sm_mpool;
/* initialize own FIFOs */
/*
@ -524,13 +644,34 @@ int mca_btl_sm_add_procs(
/* Sync with other local procs. Force the FIFO initialization to always
* happens before the readers access it.
*/
opal_atomic_add_32( &mca_btl_sm_component.sm_seg->module_seg->seg_inited, 1);
opal_atomic_add_32(&mca_btl_sm_component.sm_seg->module_seg->seg_inited, 1);
while( n_local_procs >
mca_btl_sm_component.sm_seg->module_seg->seg_inited) {
opal_progress();
opal_atomic_rmb();
}
/* it is now safe to unlink the shared memory segment. only one process
* needs to do this, so just let smp rank zero take care of it. */
if (0 == my_smp_rank) {
if (OMPI_SUCCESS !=
mca_common_sm_module_unlink(mca_btl_sm_component.sm_seg)) {
/* it is "okay" if this fails at this point. we have gone this far,
* so just warn about the failure and continue. this is probably
* only triggered by a programming error. */
opal_output(0, "WARNING: common_sm_module_unlink failed.\n");
}
/* SKG - another abstraction violation here, but I don't want to add
* extra code in the sm mpool for further synchronization. */
/* at this point, all processes have attached to the mpool segment. so
* it is safe to unlink it here. */
if (OMPI_SUCCESS !=
mca_common_sm_module_unlink(sm_mpool_modp->sm_common_module)) {
opal_output(0, "WARNING: common_sm_module_unlink failed.\n");
}
}
/* coordinate with other processes */
for(j = mca_btl_sm_component.num_smp_procs;
j < mca_btl_sm_component.num_smp_procs + n_local_procs; j++) {

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2006-2007 Voltaire. All rights reserved.
* Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010 Los Alamos National Security, LLC.
* Copyright (c) 2010-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2010-2012 IBM Corporation. All rights reserved.
* $COPYRIGHT$
@ -42,6 +42,8 @@
#include "opal/util/bit_ops.h"
#include "opal/class/opal_free_list.h"
#include "opal/mca/shmem/shmem.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/mca/common/sm/common_sm.h"
@ -83,6 +85,10 @@ BEGIN_C_DECLS
line that should hopefully be good in most places. */
#define SM_CACHE_LINE_PAD 128
/* number of members in mca_btl_sm_modex_t */
#define SM_MODEX_NUM_MEMBERS 3
#define SM_MODEX_STR_PAD 32
struct sm_fifo_t {
/* This queue pointer is used only by the heads. */
volatile void **queue;
@ -121,6 +127,58 @@ typedef struct mca_btl_sm_mem_node_t {
mca_mpool_base_module_t* sm_mpool; /**< shared memory pool */
} mca_btl_sm_mem_node_t;
/**
 * Shared Memory (SM) BTL modex.
 * Please update SM_MODEX_NUM_MEMBERS if the number of members ever changes.
 *
 * The member ids below are the "mid" values accepted by
 * mca_btl_sm_get_modex_member_off_n_size().
 */
typedef struct mca_btl_sm_modex_t {
    /* member id 0: shmem descriptor of the sm BTL segment */
    opal_shmem_ds_t sm_meta_buf;
    /* member id 1: shmem descriptor of the sm mpool segment */
    opal_shmem_ds_t sm_mpool_meta_buf;
    /* member id 2: size of the sm mpool resources */
    size_t mpool_res_size;
} mca_btl_sm_modex_t;
/**
 * Looks up the offset and/or size of one member of mca_btl_sm_modex_t.
 *
 * @param bp       modex buffer; only consulted when out_size is requested
 * @param mid      member id: 0 (sm_meta_buf), 1 (sm_mpool_meta_buf) or
 *                 2 (mpool_res_size)
 * @param out_off  if non-NULL, receives the member's offset within the struct
 * @param out_size if non-NULL, receives the member's size
 *
 * @return OMPI_SUCCESS, or OMPI_ERR_VALUE_OUT_OF_BOUNDS for an unknown mid
 *         (in which case neither output is written).
 */
static inline int
mca_btl_sm_get_modex_member_off_n_size(const mca_btl_sm_modex_t *bp,
                                       int mid, size_t *out_off,
                                       size_t *out_size) {
    /* sm_meta_buf */
    if (0 == mid) {
        if (NULL != out_off) {
            *out_off = offsetof(mca_btl_sm_modex_t, sm_meta_buf);
        }
        if (NULL != out_size) {
            *out_size = opal_shmem_sizeof_shmem_ds(&bp->sm_meta_buf);
        }
        return OMPI_SUCCESS;
    }
    /* sm_mpool_meta_buf */
    if (1 == mid) {
        if (NULL != out_off) {
            *out_off = offsetof(mca_btl_sm_modex_t, sm_mpool_meta_buf);
        }
        if (NULL != out_size) {
            *out_size = opal_shmem_sizeof_shmem_ds(&bp->sm_mpool_meta_buf);
        }
        return OMPI_SUCCESS;
    }
    /* mpool_res_size */
    if (2 == mid) {
        if (NULL != out_off) {
            *out_off = offsetof(mca_btl_sm_modex_t, mpool_res_size);
        }
        if (NULL != out_size) {
            *out_size = sizeof(bp->mpool_res_size);
        }
        return OMPI_SUCCESS;
    }
    /* not a member id we know about */
    return OMPI_ERR_VALUE_OUT_OF_BOUNDS;
}
/**
* Shared Memory (SM) BTL module.
*/

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2006-2007 Voltaire. All rights reserved.
* Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2011 Los Alamos National Security, LLC.
* Copyright (c) 2010-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2010-2012 IBM Corporation. All rights reserved.
@ -42,14 +42,18 @@
#include <sys/stat.h> /* for mkfifo */
#endif /* HAVE_SYS_STAT_H */
#include "ompi/constants.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/shmem/base/base.h"
#include "opal/mca/shmem/shmem.h"
#include "opal/util/bit_ops.h"
#include "opal/util/output.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/proc_info.h"
#include "opal/mca/base/mca_base_param.h"
#include "ompi/constants.h"
#include "ompi/runtime/ompi_module_exchange.h"
#include "ompi/mca/mpool/base/base.h"
#include "ompi/mca/common/sm/common_sm.h"
#include "ompi/mca/btl/base/btl_base_error.h"
@ -356,52 +360,450 @@ CLEANUP:
return return_value;
}
/*
 * Returns the number of processes on the node, this process included.
 */
static inline int
get_num_local_procs(void)
{
    /* num_local_peers leaves the calling process out of its count, so add
     * one for ourselves */
    const int procs_on_node = (int)(1 + orte_process_info.num_local_peers);

    return procs_on_node;
}
/*
 * Sets mca_btl_sm_component.sm_max_procs if it is currently negative.
 * With a non-negative sm_extra_procs the result is n + sm_extra_procs;
 * otherwise it is 2 * n.  A non-negative sm_max_procs is not modified.
 */
static void
calc_sm_max_procs(int n)
{
    int derived_max;

    /* sm_max_procs already holds a usable value */
    if (0 <= mca_btl_sm_component.sm_max_procs) {
        return;
    }
    /* see if we need to allocate space for extra procs */
    derived_max = (0 <= mca_btl_sm_component.sm_extra_procs)
                  ? n + mca_btl_sm_component.sm_extra_procs
                  : 2 * n;
    mca_btl_sm_component.sm_max_procs = derived_max;
}
/*
 * Creates a shared-memory segment and attaches to it through
 * mca_common_sm_module_create_and_attach(), storing the resulting module
 * in *out_modp (NULL on failure).
 *
 * comp_ptr is not used by this routine.
 *
 * Returns OMPI_SUCCESS, or OMPI_ERROR (after logging) when no module could
 * be created.
 */
static int
create_and_attach(mca_btl_sm_component_t *comp_ptr,
                  size_t size,
                  char *file_name,
                  size_t size_ctl_structure,
                  size_t data_seg_alignment,
                  mca_common_sm_module_t **out_modp)
{
    mca_common_sm_module_t *new_modp =
        mca_common_sm_module_create_and_attach(size, file_name,
                                               size_ctl_structure,
                                               data_seg_alignment);

    *out_modp = new_modp;
    if (NULL != new_modp) {
        return OMPI_SUCCESS;
    }
    opal_output(0, "create_and_attach: unable to create shared memory "
                "BTL coordinating strucure :: size %lu \n",
                (unsigned long)size);
    return OMPI_ERROR;
}
/*
 * SKG - I'm not happy with this, but I can't figure out a better way of
 * finding the sm mpool's minimum size 8-|. The way I see it, this BTL only
 * uses the sm mpool, so maybe this isn't so bad...
 *
 * The problem is that we need to size the mpool resources at sm BTL
 * component init. That means we need to know the mpool's minimum size at
 * create.
 *
 * comp_ptr - component whose sm_mpool_name selects the mpool parameter.
 * out_size - out: the parsed mpool_<name>_min_size value, or 67108864 when
 *            the parameter is missing a value or is not a positive number.
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_NOT_FOUND if the parameter does not exist,
 * or OMPI_ERROR if its value cannot be looked up.
 */
static int
get_min_mpool_size(mca_btl_sm_component_t *comp_ptr,
                   size_t *out_size)
{
    char *type_name = "mpool";
    char *param_name = "min_size";
    char *min_size = NULL;
    int id = 0;
    size_t default_min = 67108864;
    size_t size = 0;
    long tmp_size = 0;

    if (0 > (id = mca_base_param_find(type_name, comp_ptr->sm_mpool_name,
                                      param_name))) {
        opal_output(0, "mca_base_param_find: failure looking for %s_%s_%s\n",
                    type_name, comp_ptr->sm_mpool_name, param_name);
        return OMPI_ERR_NOT_FOUND;
    }
    /* treat every non-success code as a failure, not just OPAL_ERROR */
    if (OPAL_SUCCESS != mca_base_param_lookup_string(id, &min_size)) {
        opal_output(0, "mca_base_param_lookup_string failure\n");
        return OMPI_ERROR;
    }
    errno = 0;
    /* a NULL string must not reach strtol; tmp_size stays 0 in that case,
     * which selects the default below */
    if (NULL != min_size) {
        tmp_size = strtol(min_size, (char **)NULL, 10);
    }
    if (ERANGE == errno || EINVAL == errno || tmp_size <= 0) {
        opal_output(0, "mca_btl_sm::get_min_mpool_size: "
                    "Unusable %s_%s_min_size provided. "
                    "Continuing with %lu.", type_name,
                    comp_ptr->sm_mpool_name,
                    (unsigned long)default_min);
        size = default_min;
    }
    else {
        size = (size_t)tmp_size;
    }
    free(min_size);
    *out_size = size;
    return OMPI_SUCCESS;
}
/*
 * Computes the amount of memory to request for the sm mpool when max_procs
 * processes take part, and stores it in *out_res_size.
 *
 * Returns OMPI_SUCCESS, or OMPI_ERR_VALUE_OUT_OF_BOUNDS (without writing
 * *out_res_size) when scaling the per-process amount by max_procs would
 * exceed LONG_MAX - 4096.
 */
static int
get_mpool_res_size(int32_t max_procs,
                   size_t *out_res_size)
{
    size_t fifo_bytes = 0;
    size_t eager_bytes = 0;
    size_t max_frag_bytes = 0;
    size_t per_proc_bytes = 0;

    /*
     * Heuristic: we request memory for
     *   - nfifos FIFOs, each made of a sm_fifo_t structure plus fifo_size
     *     pointers
     *   - eager fragments (2*n of them, allocated in sm_free_list_inc chunks)
     *   - max fragments (sm_free_list_num of them)
     * Each term also carries a few "opal_cache_line_size" additions to
     * account for padding and edge effects that may lie in the allocator.
     */
    fifo_bytes = FIFO_MAP_NUM(max_procs) *
        (sizeof(sm_fifo_t) + sizeof(void *) *
         mca_btl_sm_component.fifo_size + 4 * opal_cache_line_size);
    eager_bytes = (2 * max_procs + mca_btl_sm_component.sm_free_list_inc) *
        (mca_btl_sm_component.eager_limit + 2 * opal_cache_line_size);
    max_frag_bytes = mca_btl_sm_component.sm_free_list_num *
        (mca_btl_sm_component.max_frag_size + 2 * opal_cache_line_size);

    per_proc_bytes = fifo_bytes + eager_bytes + max_frag_bytes;
    /* add something for the control structure */
    per_proc_bytes += sizeof(mca_common_sm_module_t);

    /* Check before multiplying by max_procs that the result won't overflow.
     * The 4096 pad is there because a little extra space will eventually be
     * needed, e.g. in mca_mpool_sm_init() in mpool_sm_component.c when
     * sizeof(mca_common_sm_module_t) is added. */
    if (((double)per_proc_bytes) * max_procs > LONG_MAX - 4096) {
        return OMPI_ERR_VALUE_OUT_OF_BOUNDS;
    }
    *out_res_size = per_proc_bytes * (size_t)max_procs;
    return OMPI_SUCCESS;
}
/*
 * Creates the shared-memory segments required for this BTL (one for the sm
 * mpool and another for the shared memory store) and populates
 * *modex_buf_ptr with their shmem_ds descriptors and the mpool size.
 *
 * It is assumed that calc_sm_max_procs has already been called (it sets
 * sm_max_procs).
 *
 * comp_ptr      - component; its sm_seg is set to the BTL segment module.
 * modex_buf_ptr - modex structure to fill in.
 *
 * Returns OMPI_SUCCESS, or an OMPI error code on failure.
 */
static int
populate_modex_bufp(mca_btl_sm_component_t *comp_ptr,
                    mca_btl_sm_modex_t *modex_buf_ptr)
{
    int rc = OMPI_SUCCESS;
    size_t size = 0;
    size_t min_size = 0;
    char *sm_mpool_ctl_file = NULL;
    char *sm_ctl_file = NULL;
    /* used as a temporary store so we can extract shmem_ds info */
    mca_common_sm_module_t *tmp_modp = NULL;

    /* first generate some unique paths for the shared-memory segments that
     * this BTL needs. */
    if (asprintf(&sm_mpool_ctl_file,
                 "%s"OPAL_PATH_SEP"shared_mem_pool.%s",
                 orte_process_info.job_session_dir,
                 orte_process_info.nodename) < 0) {
        /* the pointer is indeterminate after a failed asprintf; reset it so
         * the cleanup code below does not free garbage */
        sm_mpool_ctl_file = NULL;
        rc = OMPI_ERR_OUT_OF_RESOURCE;
        goto out;
    }
    if (asprintf(&sm_ctl_file,
                 "%s"OPAL_PATH_SEP"shared_mem_btl_module.%s",
                 orte_process_info.job_session_dir,
                 orte_process_info.nodename) < 0) {
        /* same reasoning as above */
        sm_ctl_file = NULL;
        rc = OMPI_ERR_OUT_OF_RESOURCE;
        goto out;
    }

    /* create the things */

    /* === sm mpool === */
    /* get the segment size for the sm mpool. */
    if (OMPI_SUCCESS != (rc = get_mpool_res_size(comp_ptr->sm_max_procs,
                                                 &size))) {
        /* rc is already set */
        goto out;
    }
    /* do we need to update the size based on the sm mpool's min size? */
    if (OMPI_SUCCESS != (rc = get_min_mpool_size(comp_ptr, &min_size))) {
        goto out;
    }
    if (size < min_size) {
        size = min_size;
    }
    /* we only need the shmem_ds info at this point. initialization will be
     * completed in the mpool module code. the idea is that we just need this
     * info so we can populate the modex. */
    if (OMPI_SUCCESS != (rc =
        create_and_attach(comp_ptr, size, sm_mpool_ctl_file,
                          sizeof(mca_common_sm_module_t), 8, &tmp_modp))) {
        /* rc is set */
        goto out;
    }
    /* now extract and store the shmem_ds info from the returned module */
    if (OPAL_SUCCESS !=
        opal_shmem_ds_copy(&(tmp_modp->shmem_ds),
                           &(modex_buf_ptr->sm_mpool_meta_buf))) {
        rc = OMPI_ERROR;
        goto out;
    }
    /* set the mpool_res_size in the modex */
    modex_buf_ptr->mpool_res_size = size;

    /* === sm btl === */
    /* calculate the segment size. */
    size = sizeof(mca_common_sm_seg_header_t) +
           comp_ptr->sm_max_procs *
           (sizeof(sm_fifo_t *) +
            sizeof(char *) + sizeof(uint16_t)) +
           opal_cache_line_size;
    if (OMPI_SUCCESS != (rc =
        create_and_attach(comp_ptr, size, sm_ctl_file,
                          sizeof(mca_common_sm_seg_header_t),
                          opal_cache_line_size, &comp_ptr->sm_seg))) {
        /* rc is set */
        goto out;
    }
    /* now extract and store the shmem_ds info from the returned module */
    if (OPAL_SUCCESS != opal_shmem_ds_copy(&(comp_ptr->sm_seg->shmem_ds),
                                           &(modex_buf_ptr->sm_meta_buf))) {
        rc = OMPI_ERROR;
        goto out;
    }

out:
    if (NULL != sm_mpool_ctl_file) {
        free(sm_mpool_ctl_file);
    }
    if (NULL != sm_ctl_file) {
        free(sm_ctl_file);
    }
    return rc;
}
/*
 * Publishes one modex member.
 *
 * Members 0 and 1 go out in two pieces: the bytes that precede seg_name
 * under "<key_prefix>-0", and the path that starts at seg_name under
 * "<key_prefix>-1".  Member 2 goes out as one piece of `extent` bytes under
 * key_prefix itself.
 *
 * key_prefix   - key (or key prefix) to publish under.
 * member_basep - address of the member inside the modex structure.
 * extent       - member size; only used for member 2.
 * member_id    - 0, 1 or 2.
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE if a key cannot be built,
 * OMPI_ERR_VALUE_OUT_OF_BOUNDS for an unknown member id, or the error
 * reported by the modex send routine.
 */
static int
send_member(char *key_prefix,
            unsigned char *member_basep,
            size_t extent,
            int member_id)
{
    char *piece_key = NULL;
    int status = OMPI_ERROR;
    size_t path_off = 0;

    /* member 2 is a single piece and needs no derived key */
    if (2 == member_id) {
        return ompi_modex_send_string((const char *)key_prefix,
                                      member_basep, extent);
    }
    if (0 != member_id && 1 != member_id) {
        return OMPI_ERR_VALUE_OUT_OF_BOUNDS;
    }

    path_off = offsetof(opal_shmem_ds_t, seg_name);

    /* piece 0: everything that precedes the path */
    if (-1 == asprintf(&piece_key, "%s-%d", key_prefix, 0)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    status = ompi_modex_send_string((const char *)piece_key,
                                    member_basep, path_off);
    free(piece_key);
    if (OMPI_SUCCESS != status) {
        return status;
    }

    /* piece 1: the path */
    if (-1 == asprintf(&piece_key, "%s-%d", key_prefix, 1)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    /* using ompi_modex_send_key_value here, so the data isn't encoded
     * if using PMI grpcomm. */
    status = ompi_modex_send_key_value(piece_key,
                                       (member_basep + path_off),
                                       OPAL_STRING);
    free(piece_key);
    return status;
}
/*
 * Publishes every member of *modex_bufp.  Each member is sent separately
 * through send_member() under the key "<component>-<member id>".
 *
 * comp_ptr   - component whose version string forms the key prefix.
 * modex_bufp - populated modex structure to publish.
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE if a string cannot be
 * allocated, or the first error reported while sending a member.
 */
static int
send_all_modex_members(mca_btl_sm_component_t *comp_ptr,
                       mca_btl_sm_modex_t *modex_bufp)
{
    size_t offset = 0, extent = 0;
    unsigned char *datap = (unsigned char *)modex_bufp;
    unsigned char *tmp_base = NULL;
    char *modex_comp_name = NULL;
    /* initialized so the value returned is defined even if the loop body
     * never runs */
    int rc = OMPI_SUCCESS, mid;
    char *key = NULL;

    if (NULL == (modex_comp_name =
                 mca_base_component_to_string(&comp_ptr->super.btl_version))) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    /* iterate over all the modex members and send each one under its own
     * key */
    for (mid = 0; mid < SM_MODEX_NUM_MEMBERS; ++mid) {
        if (OMPI_SUCCESS != (rc =
            mca_btl_sm_get_modex_member_off_n_size(modex_bufp, mid,
                                                   &offset, &extent))) {
            /* rc is set */
            goto out;
        }
        tmp_base = (unsigned char *)datap + offset;
        if (-1 == asprintf(&key, "%s-%d", modex_comp_name, mid)) {
            rc = OMPI_ERR_OUT_OF_RESOURCE;
            goto out;
        }
        rc = send_member(key, tmp_base, extent, mid);
        free(key);
        key = NULL;
        if (OMPI_SUCCESS != rc) {
            goto out;
        }
    }

out:
    if (NULL != modex_comp_name) {
        free(modex_comp_name);
    }
    return rc;
}
/*
 * Builds the information required for the sm modex and sends it.  Only node
 * rank zero has anything to send; every other rank returns OMPI_SUCCESS
 * right away.
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE if the modex structure
 * cannot be allocated, or the error reported while populating or sending.
 */
static int
send_modex(mca_btl_sm_component_t *comp_ptr,
           orte_node_rank_t node_rank)
{
    int status = OMPI_SUCCESS;
    mca_btl_sm_modex_t *payload = NULL;

    /* only node rank zero needs to send modex info */
    if (0 != node_rank) {
        return OMPI_SUCCESS;
    }
    payload = calloc(1, sizeof(*payload));
    if (NULL == payload) {
        /* out of resources, so just bail. */
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    status = populate_modex_bufp(comp_ptr, payload);
    if (OMPI_SUCCESS == status) {
        status = send_all_modex_members(comp_ptr, payload);
    } else {
        opal_output(0, "send_modex: populate_modex_bufp failure!\n");
    }
    /* the modex structure is only needed until it has been sent */
    free(payload);
    return status;
}
/*
* SM component initialization
*/
static mca_btl_base_module_t** mca_btl_sm_component_init(
int *num_btls,
bool enable_progress_threads,
bool enable_mpi_threads)
static mca_btl_base_module_t **
mca_btl_sm_component_init(int *num_btls,
bool enable_progress_threads,
bool enable_mpi_threads)
{
int num_local_procs = 0;
mca_btl_base_module_t **btls = NULL;
orte_node_rank_t my_node_rank = ORTE_NODE_RANK_INVALID;
#if OMPI_BTL_SM_HAVE_KNEM
int rc;
#endif
*num_btls = 0;
/* if no session directory was created, then we cannot be used */
if (!orte_create_session_dirs) {
return NULL;
}
/* lookup/create shared memory pool only when used */
mca_btl_sm_component.sm_mpool = NULL;
mca_btl_sm_component.sm_mpool_base = NULL;
#if OMPI_ENABLE_PROGRESS_THREADS == 1
/* create a named pipe to receive events */
sprintf( mca_btl_sm_component.sm_fifo_path,
"%s"OPAL_PATH_SEP"sm_fifo.%lu", orte_process_info.job_session_dir,
(unsigned long)ORTE_PROC_MY_NAME->vpid );
if(mkfifo(mca_btl_sm_component.sm_fifo_path, 0660) < 0) {
opal_output(0, "mca_btl_sm_component_init: mkfifo failed with errno=%d\n",errno);
/* if no session directory was created, then we cannot be used */
/* SKG - this isn't true anymore. Some backing facilities don't require a
* file-backed store. Extend shmem to provide this info one day. */
if (!orte_create_session_dirs) {
return NULL;
}
mca_btl_sm_component.sm_fifo_fd = open(mca_btl_sm_component.sm_fifo_path, O_RDWR);
/* if we don't have locality information, then we cannot be used */
if (ORTE_NODE_RANK_INVALID ==
(my_node_rank = orte_process_info.my_node_rank)) {
orte_show_help("help-mpi-btl-sm.txt", "no locality", true);
return NULL;
}
/* no use trying to use sm with less than two procs, so just bail. */
if ((num_local_procs = get_num_local_procs()) < 2) {
return NULL;
}
/* calculate max procs so we can figure out how large to make the
* shared-memory segment. this routine sets component sm_max_procs. */
calc_sm_max_procs(num_local_procs);
if (OMPI_SUCCESS != send_modex(&mca_btl_sm_component, my_node_rank)) {
return NULL;
}
#if OMPI_ENABLE_PROGRESS_THREADS == 1
/* create a named pipe to receive events */
sprintf(mca_btl_sm_component.sm_fifo_path,
"%s"OPAL_PATH_SEP"sm_fifo.%lu",
orte_process_info.job_session_dir,
(unsigned long)ORTE_PROC_MY_NAME->vpid);
if (mkfifo(mca_btl_sm_component.sm_fifo_path, 0660) < 0) {
opal_output(0, "mca_btl_sm_component_init: "
"mkfifo failed with errno=%d\n",errno);
return NULL;
}
mca_btl_sm_component.sm_fifo_fd = open(mca_btl_sm_component.sm_fifo_path,
O_RDWR);
if(mca_btl_sm_component.sm_fifo_fd < 0) {
opal_output(0, "mca_btl_sm_component_init: open(%s) failed with errno=%d\n",
opal_output(0, "mca_btl_sm_component_init: "
"open(%s) failed with errno=%d\n",
mca_btl_sm_component.sm_fifo_path, errno);
return NULL;
}
OBJ_CONSTRUCT(&mca_btl_sm_component.sm_fifo_thread, opal_thread_t);
mca_btl_sm_component.sm_fifo_thread.t_run = (opal_thread_fn_t) mca_btl_sm_component_event_thread;
mca_btl_sm_component.sm_fifo_thread.t_run =
(opal_thread_fn_t)mca_btl_sm_component_event_thread;
opal_thread_start(&mca_btl_sm_component.sm_fifo_thread);
#endif
mca_btl_sm_component.sm_btls = (mca_btl_sm_t **) malloc( mca_btl_sm_component.sm_max_btls * sizeof (mca_btl_sm_t *));
mca_btl_sm_component.sm_btls =
(mca_btl_sm_t **)malloc(mca_btl_sm_component.sm_max_btls *
sizeof(mca_btl_sm_t *));
if (NULL == mca_btl_sm_component.sm_btls) {
return NULL;
}

Просмотреть файл

@ -4,6 +4,8 @@
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2006-2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2012 Los Alamos National Security, LLC.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -12,6 +14,10 @@
#
# This is the US/English help file for Open MPI's shared memory support.
#
[no locality]
WARNING: Missing locality information required for sm initialization.
Continuing without shared memory support.
#
[knem requested but not supported]
WARNING: Linux kernel knem support was requested for the shared memory
(sm) BTL, but it is not supported. Deactivating the shared memory

Просмотреть файл

@ -42,6 +42,7 @@
#include "opal/align.h"
#include "opal/util/argv.h"
#include "opal/mca/shmem/shmem.h"
#if OPAL_ENABLE_FT_CR == 1
#include "opal/runtime/opal_cr.h"
#endif
@ -133,7 +134,7 @@ attach_and_init(opal_shmem_ds_t *shmem_bufp,
map->module_data_addr = addr;
map->module_seg_addr = (unsigned char *)seg;
/* note that size is only used during the first call */
if (first_call) {
/* initialize some segment information */
@ -157,20 +158,20 @@ attach_and_init(opal_shmem_ds_t *shmem_bufp,
}
/* ////////////////////////////////////////////////////////////////////////// */
/* api implementation */
/* api implementation */
/* ////////////////////////////////////////////////////////////////////////// */
/* ////////////////////////////////////////////////////////////////////////// */
mca_common_sm_module_t *
mca_common_sm_module_create(size_t size,
char *file_name,
size_t size_ctl_structure,
size_t data_seg_alignment)
mca_common_sm_module_create_and_attach(size_t size,
char *file_name,
size_t size_ctl_structure,
size_t data_seg_alignment)
{
mca_common_sm_module_t *map = NULL;
opal_shmem_ds_t *seg_meta = NULL;
if (NULL == (seg_meta = (opal_shmem_ds_t *) malloc(sizeof(*seg_meta)))) {
if (NULL == (seg_meta = (opal_shmem_ds_t *)malloc(sizeof(*seg_meta)))) {
/* out of resources */
return NULL;
}
@ -197,33 +198,39 @@ mca_common_sm_module_attach(opal_shmem_ds_t *seg_meta,
size_t size_ctl_structure,
size_t data_seg_alignment)
{
mca_common_sm_module_t *map = NULL;
/* notice that size is 0 here. it really doesn't matter because size WILL
* NOT be used because this is an attach (first_call is false). */
map = attach_and_init(seg_meta, 0, size_ctl_structure,
data_seg_alignment, false);
return map;
return attach_and_init(seg_meta, 0, size_ctl_structure,
data_seg_alignment, false);
}
/* ////////////////////////////////////////////////////////////////////////// */
mca_common_sm_module_t *
mca_common_sm_init(ompi_proc_t **procs,
size_t num_procs,
size_t size,
char *file_name,
size_t size_ctl_structure,
size_t data_seg_alignment)
/**
 * Thin wrapper around opal_shmem_unlink: hands the module's shmem_ds
 * descriptor to opal_shmem_unlink and maps the result onto an OMPI code.
 *
 * @param modp module whose shmem_ds member is passed to opal_shmem_unlink.
 *
 * @returnvalue OMPI_SUCCESS when opal_shmem_unlink returns OPAL_SUCCESS;
 *              OMPI_ERROR when modp is NULL or the unlink call fails.
 */
int
mca_common_sm_module_unlink(mca_common_sm_module_t *modp)
{
    /* nothing to unlink without a module */
    if (NULL == modp) {
        return OMPI_ERROR;
    }
    if (OPAL_SUCCESS != opal_shmem_unlink(&modp->shmem_ds)) {
        return OMPI_ERROR;
    }
    return OMPI_SUCCESS;
}
/* ////////////////////////////////////////////////////////////////////////// */
int
mca_common_sm_local_proc_reorder(ompi_proc_t **procs,
size_t num_procs,
size_t *out_num_local_procs)
{
size_t num_local_procs = 0;
bool found_lowest = false;
ompi_proc_t *temp_proc = NULL;
size_t p;
if (NULL == out_num_local_procs || NULL == procs) {
return OMPI_ERR_BAD_PARAM;
}
/* o reorder procs array to have all the local procs at the beginning.
* o look for the local proc with the lowest name.
* o determine the number of local procs.
@ -240,8 +247,7 @@ mca_common_sm_init(ompi_proc_t **procs,
/* save this proc */
procs[num_local_procs] = procs[p];
/* if we have a new lowest, swap it with position 0
* so that procs[0] is always the lowest named proc
*/
* so that procs[0] is always the lowest named proc */
if (OPAL_VALUE2_GREATER ==
orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
&(procs[p]->proc_name),
@ -257,6 +263,31 @@ mca_common_sm_init(ompi_proc_t **procs,
++num_local_procs;
}
}
*out_num_local_procs = num_local_procs;
return OMPI_SUCCESS;
}
/* ////////////////////////////////////////////////////////////////////////// */
mca_common_sm_module_t *
mca_common_sm_init(ompi_proc_t **procs,
size_t num_procs,
size_t size,
char *file_name,
size_t size_ctl_structure,
size_t data_seg_alignment)
{
/* indicates whether or not i'm the lowest named process */
bool lowest_local_proc = false;
mca_common_sm_module_t *map = NULL;
size_t num_local_procs = 0;
opal_shmem_ds_t *seg_meta = NULL;
if (OMPI_SUCCESS != mca_common_sm_local_proc_reorder(procs,
num_procs,
&num_local_procs)) {
return NULL;
}
/* if there is less than 2 local processes, there's nothing to do. */
if (num_local_procs < 2) {
@ -270,9 +301,9 @@ mca_common_sm_init(ompi_proc_t **procs,
/* determine whether or not i am the lowest local process */
lowest_local_proc =
(0 == orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
ORTE_PROC_MY_NAME,
&(procs[0]->proc_name)));
(OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
ORTE_PROC_MY_NAME,
&(procs[0]->proc_name)));
/* figure out if i am the lowest rank in the group.
* if so, i will create the shared memory backing store
@ -434,4 +465,3 @@ mca_common_sm_fini(mca_common_sm_module_t *mca_common_sm_module)
}
return rc;
}

Просмотреть файл

@ -10,7 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2011 Los Alamos National Security, LLC.
* Copyright (c) 2010-2012 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
@ -73,18 +73,31 @@ typedef struct mca_common_sm_module_t {
OBJ_CLASS_DECLARATION(mca_common_sm_module_t);
/**
* This routine is used to create a shared memory segment (whether
* it's an mmaped file or a SYSV IPC segment). It is assumed that
* This routine reorders procs array to have all the local procs at the
* beginning and returns the number of local procs through out_num_local_procs.
* The proc with the lowest name is at the beginning of the reordered procs
* array.
*
* @returnvalue OMPI_SUCCESS on success; an error code otherwise (for example
* OMPI_ERR_BAD_PARAM when procs or out_num_local_procs is NULL).
*/
OMPI_DECLSPEC extern int
mca_common_sm_local_proc_reorder(ompi_proc_t **procs,
size_t num_procs,
size_t *out_num_local_procs);
/**
* This routine is used to create and attach to a shared memory segment
* (whether it's an mmaped file or a SYSV IPC segment). It is assumed that
* the shared memory segment does not exist before this call.
*
* @returnvalue pointer to control structure at head of shared memory segment.
* Returns NULL if an error occurred.
*/
mca_common_sm_module_t *
mca_common_sm_module_create(size_t size,
char *file_name,
size_t size_ctl_structure,
size_t data_seg_alignment);
OMPI_DECLSPEC extern mca_common_sm_module_t *
mca_common_sm_module_create_and_attach(size_t size,
char *file_name,
size_t size_ctl_structure,
size_t data_seg_alignment);
/**
* This routine is used to attach to the shared memory segment associated with
@ -96,11 +109,22 @@ mca_common_sm_module_create(size_t size,
* @returnvalue pointer to control structure at head of shared memory segment.
* Returns NULL if an error occurred.
*/
mca_common_sm_module_t *
OMPI_DECLSPEC extern mca_common_sm_module_t *
mca_common_sm_module_attach(opal_shmem_ds_t *seg_meta,
size_t size_ctl_structure,
size_t data_seg_alignment);
/**
* A thin wrapper around opal_shmem_unlink.
*
* @param modp points to an initialized mca_common_sm_module_t.
*
* @returnvalue OMPI_SUCCESS if the operation completed successfully,
* OMPI_ERROR otherwise.
*/
OMPI_DECLSPEC extern int
mca_common_sm_module_unlink(mca_common_sm_module_t *modp);
/**
* This routine is used to set up a shared memory segment (whether
* it's an mmaped file or a SYSV IPC segment). It is assumed that
@ -164,7 +188,7 @@ mca_common_sm_init_group(ompi_group_t *group,
*/
OMPI_DECLSPEC extern void *
mca_common_sm_seg_alloc(struct mca_mpool_base_module_t *mpool,
size_t* size,
size_t *size,
mca_mpool_base_registration_t **registration);
/**
@ -189,4 +213,3 @@ OMPI_DECLSPEC extern mca_common_sm_module_t *mca_common_sm_module;
END_C_DECLS
#endif /* _COMMON_SM_H_ */

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010 Los Alamos National Security, LLC.
* Copyright (c) 2010-2012 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
@ -28,6 +28,7 @@
#include "ompi_config.h"
#include "opal/mca/event/event.h"
#include "opal/mca/shmem/shmem.h"
#include "ompi/mca/common/sm/common_sm.h"
#include "ompi/mca/mpool/mpool.h"
@ -36,17 +37,19 @@
BEGIN_C_DECLS
/**
 * Component-level state for the sm mpool.
 */
struct mca_mpool_sm_component_t {
    /* base mpool component; its mpool_version is what the component's
     * MCA parameters are registered against */
    mca_mpool_base_component_t super;
    /* mca_allocator_base_module_t* sm_allocator; */
    /* value of the "allocator" MCA parameter: name of the allocator
     * component to use with the sm mpool */
    char *sm_allocator_name;
    /* output stream handle from opal_output_open() when the "verbose"
     * MCA parameter is non-zero, -1 otherwise */
    int verbose;
    /* struct mca_mpool_sm_mmap_t *sm_mmap; */
};
typedef struct mca_mpool_sm_component_t mca_mpool_sm_component_t;
/* Resources handed to mca_mpool_sm_init() when an sm mpool module is
 * created. */
typedef struct mca_mpool_base_resources_t {
/* requested pool size -- presumably in bytes; its use is not visible in
 * this part of the change (TODO confirm against mca_mpool_sm_init) */
size_t size;
/* copied into the new module's mem_node field by mca_mpool_sm_init() */
int32_t mem_node;
/* backing store metadata: mca_mpool_sm_init() passes this descriptor to
 * mca_common_sm_module_attach() to attach to the segment */
opal_shmem_ds_t bs_meta_buf;
} mca_mpool_base_resources_t;
OMPI_MODULE_DECLSPEC extern mca_mpool_sm_component_t mca_mpool_sm_component;
@ -54,7 +57,7 @@ OMPI_MODULE_DECLSPEC extern mca_mpool_sm_component_t mca_mpool_sm_component;
typedef struct mca_mpool_sm_module_t {
mca_mpool_base_module_t super;
long sm_size;
mca_allocator_base_module_t * sm_allocator;
mca_allocator_base_module_t *sm_allocator;
struct mca_mpool_sm_mmap_t *sm_mmap;
mca_common_sm_module_t *sm_common_module;
int32_t mem_node;

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010 Los Alamos National Security, LLC.
* Copyright (c) 2010-2012 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
@ -45,10 +45,14 @@
/*
* Local functions
*/
static int mca_mpool_sm_open(void);
static int mca_mpool_sm_close( void );
static mca_mpool_base_module_t* mca_mpool_sm_init(
struct mca_mpool_base_resources_t* resources);
static int
mca_mpool_sm_open(void);
static int
mca_mpool_sm_close(void);
static mca_mpool_base_module_t *
mca_mpool_sm_init(struct mca_mpool_base_resources_t* resources);
mca_mpool_sm_component_t mca_mpool_sm_component = {
{
@ -90,8 +94,8 @@ static int mca_mpool_sm_open(void)
/* register SM component parameters */
mca_base_param_reg_string(&mca_mpool_sm_component.super.mpool_version,
"allocator",
"Name of allocator component to use with sm mpool",
false, false,
"Name of allocator component "
"to use with sm mpool", false, false,
"bucket",
&mca_mpool_sm_component.sm_allocator_name);
@ -100,18 +104,18 @@ static int mca_mpool_sm_open(void)
* to be set up to 2GB-1 for 32 bit and much greater for 64 bit. */
asprintf(&size_str, "%ld", default_min);
mca_base_param_reg_string(&mca_mpool_sm_component.super.mpool_version,
"min_size",
"Minimum size of the sm mpool shared memory file",
false, false, size_str, &min_size_param);
"min_size",
"Minimum size of the sm mpool shared memory file",
false, false, size_str, &min_size_param);
free(size_str);
mca_base_param_reg_int(&mca_mpool_sm_component.super.mpool_version,
"verbose",
"Enable verbose output for mpool sm component",
false, false, 0, &value);
"verbose",
"Enable verbose output for mpool sm component",
false, false, 0, &value);
if (value != 0) {
mca_mpool_sm_component.verbose = opal_output_open(NULL);
mca_mpool_sm_component.verbose = opal_output_open(NULL);
} else {
mca_mpool_sm_component.verbose = -1;
mca_mpool_sm_component.verbose = -1;
}
return OMPI_SUCCESS;
@ -128,41 +132,44 @@ static int mca_mpool_sm_close( void )
return OMPI_SUCCESS;
}
static mca_mpool_base_module_t* mca_mpool_sm_init(
struct mca_mpool_base_resources_t* resources)
static mca_mpool_base_module_t *
mca_mpool_sm_init(struct mca_mpool_base_resources_t *resources)
{
char *file_name;
int len;
mca_mpool_sm_module_t* mpool_module;
mca_mpool_sm_module_t *mpool_module;
mca_allocator_base_component_t* allocator_component;
long min_size;
ompi_proc_t **procs;
size_t num_all_procs, i, num_local_procs = 0;
/* README: this needs to change if procs in different jobs (even
spawned ones) are to talk using shared memory */
procs = ompi_proc_world(&num_all_procs);
* spawned ones) are to talk using shared memory */
if (NULL == (procs = ompi_proc_world(&num_all_procs))) {
/* out of resources, so just bail */
return NULL;
}
for (i = 0 ; i < num_all_procs ; ++i) {
if (OPAL_PROC_ON_LOCAL_NODE(procs[i]->proc_flags)) {
num_local_procs++;
}
}
/* parse the min size and validate it */
/* if other parameters are added, absolutely necessary to reset errno each time */
/* if other parameters are added, absolutely
* necessary to reset errno each time */
errno = 0;
min_size = strtol(min_size_param, (char **)NULL, 10);
if (errno == ERANGE) {
opal_output(0, "mca_mpool_sm_init: min_size overflows! set to default (%ld)", default_min);
opal_output(0, "mca_mpool_sm_init: min_size overflows! "
"set to default (%ld)", default_min);
min_size = default_min;
} else if (errno == EINVAL) {
opal_output(0, "mca_mpool_sm_init: invalid min_size entered. set it to (%ld)", default_min);
opal_output(0, "mca_mpool_sm_init: invalid min_size entered. "
"set it to (%ld)", default_min);
min_size = default_min;
}
/* Make a new mpool module */
mpool_module =
(mca_mpool_sm_module_t*)malloc(sizeof(mca_mpool_sm_module_t));
(mca_mpool_sm_module_t *)malloc(sizeof(mca_mpool_sm_module_t));
mca_mpool_sm_module_init(mpool_module);
/* set sm_size */
@ -173,23 +180,26 @@ static mca_mpool_base_module_t* mca_mpool_sm_init(
mpool_module->sm_size = min_size;
}
/* add something for the control structure */
mpool_module->sm_size += sizeof(mca_common_sm_module_t);
allocator_component = mca_allocator_component_lookup(
mca_mpool_sm_component.sm_allocator_name);
/* if specified allocator cannot be loaded - look for an alternative */
if(NULL == allocator_component) {
if(opal_list_get_size(&mca_allocator_base_components) == 0) {
mca_base_component_list_item_t* item = (mca_base_component_list_item_t*)
if (NULL == allocator_component) {
if (opal_list_get_size(&mca_allocator_base_components) == 0) {
mca_base_component_list_item_t *item =
(mca_base_component_list_item_t *)
opal_list_get_first(&mca_allocator_base_components);
allocator_component = (mca_allocator_base_component_t*)item->cli_component;
opal_output(0, "mca_mpool_sm_init: unable to locate allocator: %s - using %s\n",
mca_mpool_sm_component.sm_allocator_name, allocator_component->allocator_version.mca_component_name);
allocator_component =
(mca_allocator_base_component_t *)item->cli_component;
opal_output(
0, "mca_mpool_sm_init: "
"unable to locate allocator: %s - using %s\n",
mca_mpool_sm_component.sm_allocator_name,
allocator_component->allocator_version.mca_component_name);
} else {
opal_output(0, "mca_mpool_sm_init: unable to locate allocator: %s\n",
mca_mpool_sm_component.sm_allocator_name);
opal_output(0, "mca_mpool_sm_init: "
"unable to locate allocator: %s\n",
mca_mpool_sm_component.sm_allocator_name);
free(procs);
return NULL;
}
@ -197,41 +207,28 @@ static mca_mpool_base_module_t* mca_mpool_sm_init(
mpool_module->mem_node = resources->mem_node;
/* create initial shared memory mapping */
len = asprintf( &file_name, "%s"OPAL_PATH_SEP"shared_mem_pool.%s",
orte_process_info.job_session_dir,
orte_process_info.nodename );
if ( 0 > len ) {
free(mpool_module);
free(procs);
return NULL;
}
opal_output(mca_mpool_sm_component.verbose,
"mca_mpool_sm_init: shared memory size used: (%ld)",
mpool_module->sm_size);
if (NULL == (mpool_module->sm_common_module =
mca_common_sm_init(procs, num_all_procs,
mpool_module->sm_size,
file_name,
if (NULL == (mpool_module->sm_common_module =
mca_common_sm_module_attach(&resources->bs_meta_buf,
sizeof(mca_common_sm_module_t), 8))) {
opal_output(mca_mpool_sm_component.verbose,
"mca_mpool_sm_init: unable to create shared memory mapping (%s)", file_name);
free(file_name);
opal_output(mca_mpool_sm_component.verbose, "mca_mpool_sm_init: "
"unable to create shared memory mapping (%s)",
resources->bs_meta_buf.seg_name);
free(mpool_module);
free(procs);
return NULL;
}
free(procs);
free(file_name);
/* setup allocator */
mpool_module->sm_allocator =
allocator_component->allocator_init(true,
mca_common_sm_seg_alloc,
NULL, &(mpool_module->super));
if(NULL == mpool_module->sm_allocator) {
if (NULL == mpool_module->sm_allocator) {
opal_output(0, "mca_mpool_sm_init: unable to initialize allocator");
free(mpool_module);
return NULL;

Просмотреть файл

@ -122,13 +122,12 @@ opal_shmem_mmap_module_t opal_shmem_mmap_module = {
static inline void
shmem_ds_reset(opal_shmem_ds_t *ds_buf)
{
/* don't print ds_buf info here, as we may be printing garbage. */
OPAL_OUTPUT_VERBOSE(
(70, opal_shmem_base_output,
"%s: %s: shmem_ds_resetting "
"(id: %d, size: %lu, name: %s)\n",
"%s: %s: shmem_ds_resetting\n",
mca_shmem_mmap_component.super.base_version.mca_type_name,
mca_shmem_mmap_component.super.base_version.mca_component_name,
ds_buf->seg_id, (unsigned long)ds_buf->seg_size, ds_buf->seg_name)
mca_shmem_mmap_component.super.base_version.mca_component_name)
);
ds_buf->seg_cpid = 0;

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2010-2011 Los Alamos National Security, LLC.
* Copyright (c) 2010-2012 Los Alamos National Security, LLC.
* All rights reserved.
*
* $COPYRIGHT$
@ -108,13 +108,12 @@ opal_shmem_posix_module_t opal_shmem_posix_module = {
static inline void
shmem_ds_reset(opal_shmem_ds_t *ds_buf)
{
/* don't print ds_buf info here, as we may be printing garbage. */
OPAL_OUTPUT_VERBOSE(
(70, opal_shmem_base_output,
"%s: %s: shmem_ds_resetting "
"(id: %d, size: %lu, name: %s)\n",
"%s: %s: shmem_ds_resetting\n",
mca_shmem_posix_component.super.base_version.mca_type_name,
mca_shmem_posix_component.super.base_version.mca_component_name,
ds_buf->seg_id, (unsigned long)ds_buf->seg_size, ds_buf->seg_name)
mca_shmem_posix_component.super.base_version.mca_component_name)
);
ds_buf->seg_cpid = 0;

Просмотреть файл

@ -12,7 +12,7 @@
* Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2010-2011 Los Alamos National Security, LLC.
* Copyright (c) 2010-2012 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
@ -33,6 +33,13 @@
#include "opal_config.h"
#ifdef HAVE_STDDEF_H
#include <stddef.h>
#endif /* HAVE_STDDEF_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
BEGIN_C_DECLS
/* ////////////////////////////////////////////////////////////////////////// */
@ -92,7 +99,6 @@ do { \
#define OPAL_SHMEM_DS_IS_VALID(ds_buf) \
( (ds_buf)->flags & OPAL_SHMEM_DS_FLAGS_VALID )
/* ////////////////////////////////////////////////////////////////////////// */
typedef uint8_t opal_shmem_ds_flag_t;
/* shared memory segment header */
@ -113,13 +119,35 @@ struct opal_shmem_ds_t {
int seg_id;
/* size of shared memory segment */
size_t seg_size;
/* path to backing store */
char seg_name[OPAL_PATH_MAX];
/* base address of shared memory segment */
unsigned char *seg_base_addr;
/* path to backing store -- last element so we can easily calculate the
* "real" size of opal_shmem_ds_t. that is, the amount of the struct that
* is actually being used. for example: if seg_name is something like:
* "foo_baz" and OPAL_PATH_MAX is 4096, we want to know that only a very
* limited amount of the seg_name buffer is actually being used.
*/
char seg_name[OPAL_PATH_MAX];
};
typedef struct opal_shmem_ds_t opal_shmem_ds_t;
/* ////////////////////////////////////////////////////////////////////////// */
/**
* Simply returns the amount of used space. For use when sending the entire
* opal_shmem_ds_t payload isn't viable -- due to the potential disparity
* between the reserved buffer space and what is actually in use.
*/
static inline size_t
opal_shmem_sizeof_shmem_ds(const opal_shmem_ds_t *ds_bufp)
{
char *name_base = NULL;
size_t name_buf_offset = offsetof(opal_shmem_ds_t, seg_name);
name_base = (char *)ds_bufp + name_buf_offset;
return name_buf_offset + strlen(name_base) + 1;
}
END_C_DECLS
#endif /* OPAL_SHMEM_TYPES_H */

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2010-2011 Los Alamos National Security, LLC.
* Copyright (c) 2010-2012 Los Alamos National Security, LLC.
* All rights reserved.
*
* $COPYRIGHT$
@ -113,13 +113,12 @@ opal_shmem_sysv_module_t opal_shmem_sysv_module = {
static inline void
shmem_ds_reset(opal_shmem_ds_t *ds_buf)
{
/* don't print ds_buf info here, as we may be printing garbage. */
OPAL_OUTPUT_VERBOSE(
(70, opal_shmem_base_output,
"%s: %s: shmem_ds_resetting "
"(id: %d, size: %lu, name: %s)\n",
"%s: %s: shmem_ds_resetting\n",
mca_shmem_sysv_component.super.base_version.mca_type_name,
mca_shmem_sysv_component.super.base_version.mca_component_name,
ds_buf->seg_id, (unsigned long)ds_buf->seg_size, ds_buf->seg_name)
mca_shmem_sysv_component.super.base_version.mca_component_name)
);
ds_buf->seg_cpid = 0;
@ -195,7 +194,7 @@ segment_create(opal_shmem_ds_t *ds_buf,
* real_size here
*/
if (-1 == (ds_buf->seg_id = shmget(IPC_PRIVATE, real_size,
IPC_CREAT | IPC_EXCL | SHM_R | SHM_W))) {
IPC_CREAT | IPC_EXCL | SHM_R | SHM_W))) {
int err = errno;
char hn[MAXHOSTNAMELEN];
gethostname(hn, MAXHOSTNAMELEN - 1);

Просмотреть файл

@ -114,13 +114,12 @@ opal_shmem_windows_module_t opal_shmem_windows_module = {
static inline void
shmem_ds_reset(opal_shmem_ds_t *ds_buf)
{
/* don't print ds_buf info here, as we may be printing garbage. */
OPAL_OUTPUT_VERBOSE(
(70, opal_shmem_base_output,
"%s: %s: shmem_ds_resetting "
"(id: %d, size: %"PRIsize_t", name: %s)\n",
"%s: %s: shmem_ds_resetting\n",
mca_shmem_windows_component.super.base_version.mca_type_name,
mca_shmem_windows_component.super.base_version.mca_component_name,
ds_buf->seg_id, ds_buf->seg_size, ds_buf->seg_name)
mca_shmem_windows_component.super.base_version.mca_component_name)
);
ds_buf->seg_cpid = 0;