dadca7da88
This merge adds Checkpoint/Restart support to Open MPI. The initial frameworks and components support a LAM/MPI-like implementation. This commit follows the risk assessment presented to the Open MPI core development group on Feb. 22, 2007. This commit closes trac:158. More details to follow. This commit was SVN r14051. The following SVN revisions from the original message are invalid or inconsistent and therefore were not cross-referenced: r13912. The following Trac tickets were found above: Ticket 158 --> https://svn.open-mpi.org/trac/ompi/ticket/158
217 строки
7.5 KiB
C
217 строки
7.5 KiB
C
/*
|
|
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "ompi_config.h"
|
|
#if HAVE_UNISTD_H
|
|
#include <unistd.h>
|
|
#endif /* HAVE_UNISTD_H*/
|
|
#include "opal/util/output.h"
|
|
#include "opal/mca/base/base.h"
|
|
#include "opal/mca/base/mca_base_param.h"
|
|
#include "ompi/mca/allocator/base/base.h"
|
|
#include "mpool_sm.h"
|
|
#include "ompi/mca/common/sm/common_sm_mmap.h"
|
|
#include "orte/util/proc_info.h"
|
|
#include "orte/util/sys_info.h"
|
|
#include "ompi/proc/proc.h"
|
|
|
|
/*
|
|
* Local functions
|
|
*/
|
|
static int mca_mpool_sm_open(void);
|
|
static int mca_mpool_sm_close( void );
|
|
static mca_mpool_base_module_t* mca_mpool_sm_init(
|
|
struct mca_mpool_base_resources_t* resources);
|
|
|
|
mca_mpool_sm_component_t mca_mpool_sm_component = {
|
|
{
|
|
/* First, the mca_base_component_t struct containing meta
|
|
information about the component itself */
|
|
|
|
{
|
|
/* Indicate that we are a mpool v1.0.0 component (which also
|
|
implies a specific MCA version) */
|
|
|
|
MCA_MPOOL_BASE_VERSION_1_0_0,
|
|
|
|
"sm", /* MCA component name */
|
|
OMPI_MAJOR_VERSION, /* MCA component major version */
|
|
OMPI_MINOR_VERSION, /* MCA component minor version */
|
|
OMPI_RELEASE_VERSION, /* MCA component release version */
|
|
mca_mpool_sm_open, /* component open */
|
|
mca_mpool_sm_close
|
|
},
|
|
|
|
/* Next the MCA v1.0.0 component meta data */
|
|
|
|
{
|
|
/* The component is not checkpoint ready */
|
|
MCA_BASE_METADATA_PARAM_NONE
|
|
},
|
|
|
|
mca_mpool_sm_init
|
|
}
|
|
};
|
|
|
|
/*
 * Values returned by mca_base_param_reg_int() in mca_mpool_sm_open() for the
 * "max_size", "min_size" and "per_peer_size" parameters. mca_mpool_sm_init()
 * hands them to mca_base_param_lookup_int() to read the current settings.
 */
static int max_size_param;
static int min_size_param;
static int peer_size_param;
|
|
|
|
|
|
/**
|
|
* component open/close/init function
|
|
*/
|
|
static int mca_mpool_sm_open(void)
|
|
{
|
|
/* register SM component parameters */
|
|
mca_base_param_reg_string(&mca_mpool_sm_component.super.mpool_version,
|
|
"allocator",
|
|
"Name of allocator component to use with sm mpool",
|
|
false, false,
|
|
"bucket",
|
|
&mca_mpool_sm_component.sm_allocator_name);
|
|
|
|
max_size_param =
|
|
mca_base_param_reg_int(&mca_mpool_sm_component.super.mpool_version,
|
|
"max_size",
|
|
"Maximum size of the sm mpool shared memory file",
|
|
false, false, 512 * 1024 * 1024, NULL);
|
|
|
|
min_size_param =
|
|
mca_base_param_reg_int(&mca_mpool_sm_component.super.mpool_version,
|
|
"min_size",
|
|
"Minimum size of the sm mpool shared memory file",
|
|
false, false, 128 * 1024 * 1024, NULL);
|
|
|
|
peer_size_param =
|
|
mca_base_param_reg_int(&mca_mpool_sm_component.super.mpool_version,
|
|
"per_peer_size",
|
|
"Size (in bytes) to allocate per local peer in "
|
|
"the sm mpool shared memory file, bounded by "
|
|
"min_size and max_size",
|
|
false, false, 32 * 1024 * 1024, NULL);
|
|
|
|
mca_mpool_sm_component.sm_size = 0;
|
|
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
static int mca_mpool_sm_close( void )
|
|
{
|
|
if( NULL != mca_common_sm_mmap ) {
|
|
if( OMPI_SUCCESS == mca_common_sm_mmap_fini( mca_common_sm_mmap ) ) {
|
|
unlink( mca_common_sm_mmap->map_path );
|
|
}
|
|
OBJ_RELEASE( mca_common_sm_mmap );
|
|
}
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
static mca_mpool_base_module_t* mca_mpool_sm_init(
|
|
struct mca_mpool_base_resources_t* resources)
|
|
{
|
|
char *file_name;
|
|
int len;
|
|
mca_mpool_sm_module_t* mpool_module;
|
|
mca_allocator_base_component_t* allocator_component;
|
|
int max_size, min_size, peer_size;
|
|
ompi_proc_t **procs;
|
|
size_t num_all_procs, i, num_local_procs = 0;
|
|
|
|
/* determine size of shared memory file */
|
|
mca_base_param_lookup_int(max_size_param, &max_size);
|
|
mca_base_param_lookup_int(min_size_param, &min_size);
|
|
mca_base_param_lookup_int(peer_size_param, &peer_size);
|
|
|
|
/* README: this needs to change if procs in different jobs (even
|
|
spawned ones) are to talk using shared memory */
|
|
procs = ompi_proc_world(&num_all_procs);
|
|
for (i = 0 ; i < num_all_procs ; ++i) {
|
|
if (procs[i]->proc_flags & OMPI_PROC_FLAG_LOCAL) {
|
|
num_local_procs++;
|
|
}
|
|
}
|
|
|
|
if (min_size > max_size) {
|
|
opal_output(0, "mca_mpool_sm_init: adjusting max_size to be min_size (%d)",
|
|
min_size);
|
|
max_size = min_size;
|
|
}
|
|
|
|
/* set sm_size based on num_procs, then adjust from there */
|
|
mca_mpool_sm_component.sm_size = peer_size * num_local_procs;
|
|
if ((size_t) min_size > mca_mpool_sm_component.sm_size) {
|
|
mca_mpool_sm_component.sm_size = min_size;
|
|
}
|
|
if ((size_t) max_size < mca_mpool_sm_component.sm_size) {
|
|
mca_mpool_sm_component.sm_size = max_size;
|
|
}
|
|
|
|
allocator_component = mca_allocator_component_lookup(
|
|
mca_mpool_sm_component.sm_allocator_name);
|
|
|
|
/* if specified allocator cannout be loaded - look for an alternative */
|
|
if(NULL == allocator_component) {
|
|
if(opal_list_get_size(&mca_allocator_base_components) == 0) {
|
|
mca_base_component_list_item_t* item = (mca_base_component_list_item_t*)
|
|
opal_list_get_first(&mca_allocator_base_components);
|
|
allocator_component = (mca_allocator_base_component_t*)item->cli_component;
|
|
opal_output(0, "mca_mpool_sm_init: unable to locate allocator: %s - using %s\n",
|
|
mca_mpool_sm_component.sm_allocator_name, allocator_component->allocator_version.mca_component_name);
|
|
} else {
|
|
opal_output(0, "mca_mpool_sm_init: unable to locate allocator: %s\n",
|
|
mca_mpool_sm_component.sm_allocator_name);
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
|
|
mpool_module = (mca_mpool_sm_module_t*)malloc(sizeof(mca_mpool_sm_module_t));
|
|
mca_mpool_sm_module_init(mpool_module);
|
|
|
|
/* create initial shared memory mapping */
|
|
len = asprintf( &file_name, "%s"OPAL_PATH_SEP"shared_mem_pool.%s",
|
|
orte_process_info.job_session_dir,
|
|
orte_system_info.nodename );
|
|
if ( 0 > len ) {
|
|
return NULL;
|
|
}
|
|
|
|
if(NULL ==
|
|
(mca_common_sm_mmap =
|
|
mca_common_sm_mmap_init(mca_mpool_sm_component.sm_size,
|
|
file_name,sizeof(mca_common_sm_mmap_t), 8 )
|
|
))
|
|
{
|
|
opal_output(0, "mca_mpool_sm_init: unable to create shared memory mapping (%s)", file_name);
|
|
free(file_name);
|
|
return NULL;
|
|
}
|
|
free(file_name);
|
|
|
|
/* setup allocator */
|
|
mpool_module->sm_allocator =
|
|
allocator_component->allocator_init(true,
|
|
mca_common_sm_mmap_seg_alloc, NULL, NULL);
|
|
if(NULL == mpool_module->sm_allocator) {
|
|
opal_output(0, "mca_mpool_sm_init: unable to initialize allocator");
|
|
return NULL;
|
|
}
|
|
|
|
return &mpool_module->super;
|
|
}
|
|
|