
NOTE: mmap is still the default.

Some highlights:
 o Silent component failover.
 o The sysv component will only be queried for selection if it is placed before the mmap component (for example, -mca mpi_common_sm sysv,posix,mmap). In the default case, sysv will never be queried/selected.
 o Per some on-list discussion, the mmapped file is now unlinked in both the mmap and posix components (see the "System V Shared Memory for Open MPI: Request for Community Input and Testing" thread).
 o Local process homogeneity is assumed with respect to all utilized shared memory facilities. That is, if one local process deems a particular shared memory facility acceptable, then ALL local processes should be able to utilize that facility. As it stands, this is an important point because one process dictates to all other local processes which common sm component will be selected, based on its own local run-time test.
 o Addressed some of George's code reuse concerns.

This commit was SVN r23633.
358 строки
11 KiB
C
358 строки
11 KiB
C
/*
|
|
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
|
* Copyright (c) 2008-2010 Cisco Systems, Inc. All rights reserved.
|
|
* Copyright (c) 2010 Los Alamos National Security, LLC.
|
|
* All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "ompi_config.h"
|
|
#include <errno.h>
|
|
#ifdef HAVE_UNISTD_H
|
|
#include <unistd.h>
|
|
#endif
|
|
#ifdef HAVE_STRING_H
|
|
#include <string.h>
|
|
#endif /* HAVE_STRING_H */
|
|
#ifdef HAVE_FCNTL_H
|
|
#include <fcntl.h>
|
|
#endif /* HAVE_FCNTL_H */
|
|
#ifdef HAVE_TIME_H
|
|
#include <time.h>
|
|
#endif /* HAVE_TIME_H */
|
|
#ifdef HAVE_SYS_TYPES_H
|
|
#include <sys/types.h>
|
|
#endif
|
|
#ifdef HAVE_SYS_STAT_H
|
|
#include <sys/stat.h>
|
|
#endif /* HAVE_SYS_STAT_H */
|
|
#ifdef HAVE_SYS_MMAN_H
|
|
#include <sys/mman.h>
|
|
#endif
|
|
|
|
#include "opal/util/output.h"
|
|
#include "opal/util/path.h"
|
|
#include "opal/align.h"
|
|
#include "opal/threads/mutex.h"
|
|
#include "opal/util/opal_sos.h"
|
|
|
|
#include "orte/mca/rml/rml.h"
|
|
#include "orte/util/name_fns.h"
|
|
#include "orte/util/show_help.h"
|
|
#include "orte/runtime/orte_globals.h"
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
|
|
#include "ompi/constants.h"
|
|
#include "ompi/proc/proc.h"
|
|
#include "ompi/mca/dpm/dpm.h"
|
|
#include "ompi/mca/mpool/sm/mpool_sm.h"
|
|
|
|
#include "common_sm_rml.h"
|
|
#include "common_sm_mmap.h"
|
|
|
|
/*
 * Class instance for the mmap-backed common sm module, declared with
 * opal_object_t as its parent.  The two trailing NULL arguments are
 * presumably the constructor/destructor slots of OBJ_CLASS_INSTANCE
 * (i.e. no custom construct/destruct hooks) -- confirm against
 * opal/class/opal_object.h.
 */
OBJ_CLASS_INSTANCE(mca_common_sm_module_mmap_t, opal_object_t, NULL, NULL);
|
|
|
|
/**
|
|
* list of RML messages that have arrived that have not yet been
|
|
* consumed by the thread who is looking to complete its component
|
|
* initialization based on the contents of the RML message.
|
|
*/
|
|
static opal_list_t pending_rml_msgs;
|
|
static bool pending_rml_msgs_init = false;
|
|
|
|
/*
|
|
* Lock to protect multiple instances of mmap_init() from being
|
|
* invoked simultaneously (because of RML usage).
|
|
*/
|
|
static opal_mutex_t mutex;
|
|
|
|
/**
|
|
* shared memory information used for initialization and setup.
|
|
*/
|
|
static mca_common_sm_rml_sm_info_t sm_info;
|
|
|
|
static mca_common_sm_module_mmap_t *
|
|
create_map(int fd, size_t size,
|
|
char *file_name,
|
|
size_t size_ctl_structure,
|
|
size_t data_seg_alignment)
|
|
{
|
|
mca_common_sm_module_mmap_t *map;
|
|
mca_common_sm_seg_header_t *seg;
|
|
unsigned char *addr = NULL;
|
|
|
|
/* map the file and initialize segment state */
|
|
seg = (mca_common_sm_seg_header_t *)
|
|
mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
|
|
if (MAP_FAILED == seg) {
|
|
orte_show_help("help-mpi-common-sm.txt", "sys call fail", 1,
|
|
orte_process_info.nodename,
|
|
"mmap(2)", "", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
strerror(errno), errno);
|
|
return NULL;
|
|
}
|
|
|
|
/* set up the map object */
|
|
map = OBJ_NEW(mca_common_sm_module_mmap_t);
|
|
strncpy(map->super.module_seg_path, file_name, OPAL_PATH_MAX);
|
|
/* the first entry in the file is the control structure. The first
|
|
entry in the control structure is an mca_common_sm_seg_header_t
|
|
element */
|
|
map->super.module_seg = seg;
|
|
|
|
addr = ((unsigned char *)seg) + size_ctl_structure;
|
|
/* If we have a data segment (i.e., if 0 != data_seg_alignment),
|
|
then make it the first aligned address after the control
|
|
structure. IF THIS HAPPENS, THIS IS A PROGRAMMING ERROR IN
|
|
OPEN MPI!*/
|
|
if (0 != data_seg_alignment) {
|
|
addr = OPAL_ALIGN_PTR(addr, data_seg_alignment, unsigned char*);
|
|
|
|
/* is addr past end of file ? */
|
|
if((unsigned char*)seg + size < addr) {
|
|
orte_show_help("help-mpi-common-sm.txt", "mmap too small", 1,
|
|
orte_process_info.nodename,
|
|
(unsigned long) size,
|
|
(unsigned long) size_ctl_structure,
|
|
(unsigned long) data_seg_alignment);
|
|
return NULL;
|
|
}
|
|
}
|
|
map->super.module_data_addr = addr;
|
|
map->super.module_seg_addr = (unsigned char *)seg;
|
|
map->super.module_size = size;
|
|
|
|
/* map object successful initialized - we can safely increment seg_att */
|
|
opal_atomic_wmb();
|
|
opal_atomic_add_32(&map->super.module_seg->seg_att, 1);
|
|
|
|
return map;
|
|
}
|
|
|
|
/******************************************************************************/
|
|
/**
|
|
* mca_common_sm_mmap_component_query
|
|
*/
|
|
int
|
|
mca_common_sm_mmap_component_query(void)
|
|
{
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
mca_common_sm_module_t *
|
|
mca_common_sm_mmap_init(ompi_proc_t **sorted_procs,
|
|
size_t num_loc_procs,
|
|
size_t size, char *file_name,
|
|
size_t size_ctl_structure,
|
|
size_t data_seg_alignment)
|
|
{
|
|
int fd = -1;
|
|
bool lowest;
|
|
mca_common_sm_module_mmap_t *map = NULL;
|
|
size_t mem_offset;
|
|
int num_local_procs;
|
|
|
|
lowest = (0 == orte_util_compare_name_fields(
|
|
ORTE_NS_CMP_ALL,
|
|
ORTE_PROC_MY_NAME,
|
|
&(sorted_procs[0]->proc_name)));
|
|
|
|
/* using sm_info.id as an initialization marker:
|
|
* o 0 -> not initialized; 1 -> initialized
|
|
*/
|
|
sm_info.id = 0;
|
|
memset(sm_info.posix_fname_buff, '\0', OMPI_COMMON_SM_POSIX_FILE_LEN_MAX);
|
|
/**
|
|
* remember that this function was passed
|
|
* a sorted procs array and a local proc count.
|
|
*/
|
|
num_local_procs = num_loc_procs;
|
|
|
|
/* Lock here to prevent multiple threads from invoking this
|
|
function simultaneously. The critical section we're protecting
|
|
is usage of the RML in this block. */
|
|
opal_mutex_lock(&mutex);
|
|
|
|
if (!pending_rml_msgs_init)
|
|
{
|
|
OBJ_CONSTRUCT(&(pending_rml_msgs), opal_list_t);
|
|
pending_rml_msgs_init = true;
|
|
}
|
|
|
|
/* Figure out if I am the lowest rank in the group. If so, I will
|
|
create the shared file. */
|
|
if (lowest) {
|
|
/* check, whether the specified filename is on a network file system */
|
|
if (opal_path_nfs(file_name)) {
|
|
orte_show_help("help-mpi-common-sm.txt", "mmap on nfs", 1,
|
|
orte_process_info.nodename, file_name);
|
|
}
|
|
/* process initializing the file */
|
|
fd = open(file_name, O_CREAT|O_RDWR, 0600);
|
|
if (fd < 0) {
|
|
int err = errno;
|
|
orte_show_help("help-mpi-common-sm.txt", "sys call fail", 1,
|
|
orte_process_info.nodename,
|
|
"open(2)", file_name, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
strerror(err), err);
|
|
} else if (ftruncate(fd, size) != 0) {
|
|
int err = errno;
|
|
orte_show_help("help-mpi-common-sm.txt", "sys call fail", 1,
|
|
orte_process_info.nodename,
|
|
"ftruncate(2)", "", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
strerror(err), err);
|
|
close(fd);
|
|
unlink(file_name);
|
|
fd = -1;
|
|
} else {
|
|
map = create_map(fd, size, file_name, size_ctl_structure,
|
|
data_seg_alignment);
|
|
if (map != NULL) {
|
|
sm_info.id = 1;
|
|
|
|
/* initialize the segment - only the first process
|
|
to open the file */
|
|
mem_offset =
|
|
map->super.module_data_addr -
|
|
(unsigned char *)map->super.module_seg;
|
|
map->super.module_seg->seg_offset = mem_offset;
|
|
map->super.module_seg->seg_size = size - mem_offset;
|
|
opal_atomic_unlock(&map->super.module_seg->seg_lock);
|
|
map->super.module_seg->seg_inited = 0;
|
|
} else {
|
|
close(fd);
|
|
unlink(file_name);
|
|
fd = -1;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Signal the rest of the local procs that the backing file
|
|
has been created. */
|
|
if (OMPI_SUCCESS != mca_common_sm_rml_info_bcast(
|
|
&sm_info,
|
|
sorted_procs,
|
|
num_local_procs,
|
|
OMPI_RML_TAG_SM_BACK_FILE_CREATED,
|
|
lowest,
|
|
file_name,
|
|
&(pending_rml_msgs))) {
|
|
goto out;
|
|
}
|
|
|
|
if (lowest)
|
|
{
|
|
if (1 == sm_info.id)
|
|
{
|
|
/* wait until all other local procs have reported in */
|
|
while (num_local_procs > map->super.module_seg->seg_att);
|
|
{
|
|
opal_atomic_rmb();
|
|
}
|
|
/**
|
|
* all other local procs reported in, so it's safe to unlink
|
|
*/
|
|
unlink(file_name);
|
|
}
|
|
} else {
|
|
/* check to see if file initialized correctly */
|
|
if (sm_info.id != 0) {
|
|
fd = open(file_name, O_RDWR, 0600);
|
|
|
|
if (fd != -1) {
|
|
map = create_map(fd, size, file_name, size_ctl_structure,
|
|
data_seg_alignment);
|
|
}
|
|
}
|
|
}
|
|
|
|
out:
|
|
opal_mutex_unlock(&mutex);
|
|
|
|
if (fd != -1) {
|
|
close(fd);
|
|
}
|
|
|
|
return &(map->super);
|
|
}
|
|
|
|
int
|
|
mca_common_sm_mmap_fini(mca_common_sm_module_t *mca_common_sm_module)
|
|
{
|
|
mca_common_sm_module_mmap_t *mmap_module =
|
|
(mca_common_sm_module_mmap_t *)mca_common_sm_module;
|
|
int rc = OMPI_SUCCESS;
|
|
|
|
if( NULL != mmap_module->super.module_seg ) {
|
|
rc = munmap((void*) mmap_module->super.module_seg_addr,
|
|
mmap_module->super.module_size);
|
|
mmap_module->super.module_seg_addr = NULL;
|
|
mmap_module->super.module_size = 0;
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
/**
|
|
* allocate memory from a previously allocated shared memory
|
|
* block.
|
|
*
|
|
* @param size size of request, in bytes (IN)
|
|
*
|
|
* @retval addr virtual address
|
|
*/
|
|
|
|
void *
|
|
mca_common_sm_mmap_seg_alloc(struct mca_mpool_base_module_t* mpool,
|
|
size_t* size,
|
|
mca_mpool_base_registration_t** registration)
|
|
{
|
|
mca_mpool_sm_module_t *sm_module = (mca_mpool_sm_module_t*) mpool;
|
|
mca_common_sm_module_mmap_t *map =
|
|
(mca_common_sm_module_mmap_t *)sm_module->sm_common_module;
|
|
mca_common_sm_seg_header_t* seg = map->super.module_seg;
|
|
void* addr;
|
|
|
|
opal_atomic_lock(&seg->seg_lock);
|
|
if(seg->seg_offset + *size > seg->seg_size) {
|
|
addr = NULL;
|
|
} else {
|
|
size_t fixup;
|
|
|
|
/* add base address to segment offset */
|
|
addr = map->super.module_data_addr + seg->seg_offset;
|
|
seg->seg_offset += *size;
|
|
|
|
/* fix up seg_offset so next allocation is aligned on a
|
|
sizeof(long) boundry. Do it here so that we don't have to
|
|
check before checking remaining size in buffer */
|
|
if ((fixup = (seg->seg_offset & (sizeof(long) - 1))) > 0) {
|
|
seg->seg_offset += sizeof(long) - fixup;
|
|
}
|
|
}
|
|
if (NULL != registration) {
|
|
*registration = NULL;
|
|
}
|
|
opal_atomic_unlock(&seg->seg_lock);
|
|
return addr;
|
|
}
|
|
|