
NOTE: mmap is still the default. Some highlights:
  o Silent component failover.
  o The sysv component will only be queried for selection if it is placed before the mmap component (for example, -mca mpi_common_sm sysv,posix,mmap). In the default case, sysv will never be queried/selected.
  o Per some on-list discussion, the mmapped file is now unlinked in both the mmap and posix components (see the "System V Shared Memory for Open MPI: Request for Community Input and Testing" thread).
  o Local process homogeneity is assumed with respect to all utilized shared memory facilities. That is, if one local process deems a particular shared memory facility acceptable, then ALL local processes should be able to utilize that facility. As it stands, this is an important point because one process dictates to all other local processes which common sm component will be selected, based on its own local run-time test.
  o Addressed some of George's code reuse concerns.
This commit was SVN r23633.
505 строки
16 KiB
C
505 строки
16 KiB
C
/*
|
|
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
|
* Copyright (c) 2008-2010 Cisco Systems, Inc. All rights reserved.
|
|
* Copyright (c) 2010 Los Alamos National Security, LLC.
|
|
* All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "ompi_config.h"
|
|
#include <errno.h>
|
|
#ifdef HAVE_UNISTD_H
|
|
#include <unistd.h>
|
|
#endif
|
|
#ifdef HAVE_STRING_H
|
|
#include <string.h>
|
|
#endif /* HAVE_STRING_H */
|
|
#ifdef HAVE_FCNTL_H
|
|
#include <fcntl.h>
|
|
#endif /* HAVE_FCNTL_H */
|
|
#ifdef HAVE_TIME_H
|
|
#include <time.h>
|
|
#endif /* HAVE_TIME_H */
|
|
#ifdef HAVE_SYS_TYPES_H
|
|
#include <sys/types.h>
|
|
#endif
|
|
#ifdef HAVE_SYS_STAT_H
|
|
#include <sys/stat.h>
|
|
#endif /* HAVE_SYS_STAT_H */
|
|
#ifdef HAVE_SYS_MMAN_H
|
|
#include <sys/mman.h>
|
|
#endif
|
|
|
|
#include "opal/util/output.h"
|
|
#include "opal/util/path.h"
|
|
#include "opal/align.h"
|
|
#include "opal/threads/mutex.h"
|
|
#include "opal/util/opal_sos.h"
|
|
|
|
#include "orte/mca/rml/rml.h"
|
|
#include "orte/util/name_fns.h"
|
|
#include "orte/util/show_help.h"
|
|
#include "orte/runtime/orte_globals.h"
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
|
|
#include "ompi/constants.h"
|
|
#include "ompi/proc/proc.h"
|
|
#include "ompi/mca/dpm/dpm.h"
|
|
#include "ompi/mca/mpool/sm/mpool_sm.h"
|
|
|
|
#include "common_sm_rml.h"
|
|
#include "common_sm_posix.h"
|
|
|
|
/* max number of attempts to find an available
|
|
* shm_open file name. see comments below.
|
|
*/
|
|
#define OMPI_COMMON_SM_POSIX_MAX_ATTEMPTS 64
|
|
/* need the / for Solaris 10 and others, i'm sure */
|
|
#define OMPI_COMMON_SM_POSIX_FILE_NAME_PREFIX "/open_mpi."
|
|
|
|
/* class instance for the posix module type; its parent class is
 * opal_object_t and both trailing callback arguments are NULL
 * (NOTE(review): presumably the constructor/destructor slots - confirm
 * against the OBJ_CLASS_INSTANCE definition).
 */
OBJ_CLASS_INSTANCE(mca_common_sm_module_posix_t, opal_object_t, NULL, NULL);
|
|
|
|
/**
|
|
* lock to protect multiple instances of posix_init() from being
|
|
* invoked simultaneously (because of rml usage).
|
|
*/
|
|
static opal_mutex_t mutex;
|
|
|
|
/**
|
|
* shared memory information used for initialization and setup.
|
|
*/
|
|
static mca_common_sm_rml_sm_info_t sm_info;
|
|
|
|
/**
|
|
* list of RML messages that have arrived that have not yet been
|
|
* consumed by the thread who is looking to complete its component
|
|
* initialization based on the contents of the RML message.
|
|
*/
|
|
static opal_list_t pending_rml_msgs;
|
|
static bool pending_rml_msgs_init = false;
|
|
|
|
/******************************************************************************/
|
|
/**
|
|
* this routine searches for an available shm_open file name.
|
|
*
|
|
* @return if successful, a non-negative file descriptor is returned and
|
|
* posix_file_name_buff will contain the file name associated with the
|
|
* successful shm_open. otherwise, -1 is returned and the contents of
|
|
* posix_file_name_buff is undefined.
|
|
*/
|
|
static int
|
|
posix_shm_open(char *posix_file_name_buff, size_t size)
|
|
{
|
|
int attempt = 0;
|
|
int fd = -1;
|
|
/* format: /open_mpi.nnnn
|
|
* see comment in common_sm.h that explains
|
|
* why we chose to do things this way.
|
|
*/
|
|
snprintf(posix_file_name_buff,
|
|
size,
|
|
"%s%04d",
|
|
OMPI_COMMON_SM_POSIX_FILE_NAME_PREFIX,
|
|
attempt++);
|
|
/**
|
|
* workaround for simultaneous posix shm_opens on the same node (e.g.
|
|
* multiple Open MPI jobs sharing a node). name collision during
|
|
* component runtime will happen, so protect against it.
|
|
*/
|
|
while (attempt < OMPI_COMMON_SM_POSIX_MAX_ATTEMPTS)
|
|
{
|
|
/* the check for the existence of the object and its
|
|
* creation if it does not exist are performed atomically.
|
|
*/
|
|
if ((fd = shm_open(posix_file_name_buff,
|
|
O_CREAT | O_EXCL | O_RDWR,
|
|
0600)) < 0)
|
|
{
|
|
int err = errno;
|
|
if (EEXIST == err)
|
|
{
|
|
/* try again with a different name */
|
|
snprintf(posix_file_name_buff,
|
|
size,
|
|
"%s%04d",
|
|
OMPI_COMMON_SM_POSIX_FILE_NAME_PREFIX,
|
|
attempt++);
|
|
continue;
|
|
}
|
|
else /* a "real" error occurred, notify the user and return -1 */
|
|
{
|
|
orte_show_help("help-mpi-common-sm.txt",
|
|
"sys call fail",
|
|
1,
|
|
orte_process_info.nodename,
|
|
"shm_open(2)",
|
|
posix_file_name_buff,
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
strerror(err),
|
|
err);
|
|
fd = -1;
|
|
break;
|
|
}
|
|
}
|
|
else /* success! */
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
if (OMPI_COMMON_SM_POSIX_MAX_ATTEMPTS <= attempt)
|
|
{
|
|
MCA_COMMON_SM_OUTPUT_VERBOSE("max attempts exceeded: could not find an "
|
|
"available posix shared object file name");
|
|
}
|
|
return fd;
|
|
}
|
|
|
|
/******************************************************************************/
|
|
static mca_common_sm_module_posix_t *
|
|
create_map(int fd,
|
|
size_t size,
|
|
size_t size_ctl_structure,
|
|
size_t data_seg_alignment)
|
|
{
|
|
unsigned char *addr = NULL;
|
|
mca_common_sm_module_posix_t *map = NULL;
|
|
mca_common_sm_seg_header_t *seg = NULL;
|
|
|
|
/* map the file and initialize segment state */
|
|
if (MAP_FAILED == (seg = (mca_common_sm_seg_header_t *)
|
|
mmap(NULL,
|
|
size,
|
|
PROT_READ | PROT_WRITE,
|
|
MAP_SHARED,
|
|
fd,
|
|
0)))
|
|
{
|
|
int err = errno;
|
|
orte_show_help("help-mpi-common-sm.txt",
|
|
"sys call fail",
|
|
1,
|
|
orte_process_info.nodename,
|
|
"mmap(2)",
|
|
"",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
strerror(err),
|
|
err);
|
|
return NULL;
|
|
}
|
|
|
|
/* set up the map object */
|
|
map = OBJ_NEW(mca_common_sm_module_posix_t);
|
|
|
|
/**
|
|
* the first entry in the file is the control structure. the first entry
|
|
* in the control structure is an mca_common_sm_seg_header_t element
|
|
*/
|
|
map->super.module_seg = seg;
|
|
|
|
addr = ((unsigned char *)seg) + size_ctl_structure;
|
|
/**
|
|
* if we have a data segment (i.e., if 0 != data_seg_alignment),
|
|
* then make it the first aligned address after the control
|
|
* structure. if this happens, this is a programming error in
|
|
* Open MPI!
|
|
*/
|
|
if (0 != data_seg_alignment)
|
|
{
|
|
addr = OPAL_ALIGN_PTR(addr, data_seg_alignment, unsigned char *);
|
|
|
|
/* is addr past end of shared memory object ? */
|
|
if ((unsigned char *)seg + size < addr)
|
|
{
|
|
orte_show_help("help-mpi-common-sm.txt",
|
|
"mmap too small",
|
|
1,
|
|
orte_process_info.nodename,
|
|
(unsigned long)size,
|
|
(unsigned long)size_ctl_structure,
|
|
(unsigned long)data_seg_alignment);
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
map->super.module_data_addr = addr;
|
|
map->super.module_seg_addr = (unsigned char *)seg;
|
|
map->super.module_size = size;
|
|
|
|
/* map object successful initialized - we can safely increment seg_att */
|
|
opal_atomic_wmb();
|
|
opal_atomic_add_32(&map->super.module_seg->seg_att, 1);
|
|
|
|
return map;
|
|
}
|
|
|
|
/******************************************************************************/
|
|
/**
|
|
* this routine performs the posix sm component run-time test.
|
|
*
|
|
* @return OMPI_SUCCESS if posix sm can be used, OMPI_ERR_NOT_SUPPORTED
|
|
* otherwise.
|
|
*/
|
|
int
|
|
mca_common_sm_posix_component_query(void)
|
|
{
|
|
int rc = OMPI_SUCCESS;
|
|
int fd = -1;
|
|
|
|
if (-1 == (fd = posix_shm_open(sm_info.posix_fname_buff,
|
|
(OMPI_COMMON_SM_POSIX_FILE_LEN_MAX - 1))))
|
|
{
|
|
rc = OMPI_ERR_NOT_SUPPORTED;
|
|
}
|
|
|
|
if (-1 != fd)
|
|
{
|
|
shm_unlink(sm_info.posix_fname_buff);
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
/******************************************************************************/
|
|
/**
|
|
* this routine assumes that sorted_procs is in the following state:
|
|
* o all the local procs at the beginning.
|
|
* o sorted_procs[0] is the lowest named process.
|
|
*/
|
|
mca_common_sm_module_t *
|
|
mca_common_sm_posix_init(ompi_proc_t **sorted_procs,
|
|
size_t num_local_procs,
|
|
size_t size,
|
|
char *file_name,
|
|
size_t size_ctl_structure,
|
|
size_t data_seg_alignment)
|
|
{
|
|
int fd = -1;
|
|
mca_common_sm_module_posix_t *map = NULL;
|
|
bool lowest;
|
|
size_t mem_offset;
|
|
int n_local_procs;
|
|
|
|
lowest = (0 == orte_util_compare_name_fields(
|
|
ORTE_NS_CMP_ALL,
|
|
ORTE_PROC_MY_NAME,
|
|
&(sorted_procs[0]->proc_name)));
|
|
|
|
/* using sm_info.id as an initialization marker:
|
|
* o 0 -> not initialized; 1 -> initialized
|
|
*/
|
|
sm_info.id = 0;
|
|
memset(sm_info.posix_fname_buff, '\0', OMPI_COMMON_SM_POSIX_FILE_LEN_MAX);
|
|
|
|
/**
|
|
* lock here to prevent multiple threads from invoking this function
|
|
* simultaneously. the critical section we're protecting is usage of
|
|
* the RML in this block.
|
|
*/
|
|
opal_mutex_lock(&mutex);
|
|
|
|
if (!pending_rml_msgs_init)
|
|
{
|
|
OBJ_CONSTRUCT(&(pending_rml_msgs), opal_list_t);
|
|
pending_rml_msgs_init = true;
|
|
}
|
|
|
|
/**
|
|
* figure out if i am the lowest proc in the group.
|
|
* if i am, initialize the shared memory object.
|
|
*/
|
|
if (lowest)
|
|
{
|
|
/* initialize POSIX shared memory object */
|
|
if (-1 == (fd = posix_shm_open(sm_info.posix_fname_buff,
|
|
(OMPI_COMMON_SM_POSIX_FILE_LEN_MAX - 1))))
|
|
{
|
|
/* do nothing. if a real error occurred or the file name search
|
|
* limit was reached, posix_shm_open will take care of the
|
|
* notification part.
|
|
*/
|
|
;
|
|
}
|
|
else if (0 != ftruncate(fd, size))
|
|
{
|
|
int err = errno;
|
|
orte_show_help("help-mpi-common-sm.txt",
|
|
"sys call fail",
|
|
1,
|
|
orte_process_info.nodename,
|
|
"ftruncate(2)",
|
|
"",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
strerror(err),
|
|
err);
|
|
shm_unlink(sm_info.posix_fname_buff);
|
|
}
|
|
else
|
|
{
|
|
map = create_map(fd,
|
|
size,
|
|
size_ctl_structure,
|
|
data_seg_alignment);
|
|
if (NULL != map)
|
|
{
|
|
sm_info.id = 1;
|
|
/* initialize the segment */
|
|
mem_offset =
|
|
map->super.module_data_addr -
|
|
(unsigned char *)map->super.module_seg;
|
|
map->super.module_seg->seg_offset = mem_offset;
|
|
map->super.module_seg->seg_size = size - mem_offset;
|
|
#if 0 /* i don't think this unlock is needed, but it's in mmap's source */
|
|
opal_atomic_unlock(&map->super.module_seg->seg_lock);
|
|
#endif
|
|
map->super.module_seg->seg_inited = 0;
|
|
}
|
|
else
|
|
{
|
|
shm_unlink(sm_info.posix_fname_buff);
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* signal the rest of the local procs that a
|
|
* new shared memory object has been created.
|
|
*/
|
|
if (OMPI_SUCCESS != mca_common_sm_rml_info_bcast(
|
|
&sm_info,
|
|
sorted_procs,
|
|
num_local_procs,
|
|
OMPI_RML_TAG_SM_BACK_FILE_CREATED,
|
|
lowest,
|
|
file_name,
|
|
&(pending_rml_msgs)))
|
|
{
|
|
goto out;
|
|
}
|
|
|
|
if (!lowest)
|
|
{
|
|
/* make certain that things were initialized correctly */
|
|
if (0 != sm_info.id)
|
|
{
|
|
if ((fd = shm_open(sm_info.posix_fname_buff, O_RDWR, 0600)) > 0)
|
|
{
|
|
map = create_map(fd,
|
|
size,
|
|
size_ctl_structure,
|
|
data_seg_alignment);
|
|
}
|
|
}
|
|
}
|
|
/* if all things were initialized properly, wait until all other local
|
|
* procs have reported in before calling shm_unlink
|
|
*/
|
|
else
|
|
{
|
|
if (1 == sm_info.id)
|
|
{
|
|
n_local_procs = (int)num_local_procs;
|
|
while (n_local_procs > map->super.module_seg->seg_att);
|
|
{
|
|
opal_atomic_rmb();
|
|
}
|
|
/* all other local procs reported in, so it's safe to shm_unlink */
|
|
shm_unlink(sm_info.posix_fname_buff);
|
|
}
|
|
}
|
|
|
|
out:
|
|
opal_mutex_unlock(&mutex);
|
|
|
|
return &(map->super);
|
|
}
|
|
|
|
/******************************************************************************/
|
|
int
|
|
mca_common_sm_posix_fini(mca_common_sm_module_t *mca_common_sm_module)
|
|
{
|
|
/* no need for shm_unlink here because it was already taken care of */
|
|
mca_common_sm_module_posix_t *posix_module =
|
|
(mca_common_sm_module_posix_t *)mca_common_sm_module;
|
|
int rc = OMPI_SUCCESS;
|
|
|
|
if (NULL != posix_module->super.module_seg)
|
|
{
|
|
rc = munmap((void *)posix_module->super.module_seg_addr,
|
|
posix_module->super.module_size);
|
|
posix_module->super.module_seg_addr = NULL;
|
|
posix_module->super.module_size = 0;
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
/******************************************************************************/
|
|
/**
|
|
* allocate memory from a previously allocated shared memory
|
|
* block.
|
|
*
|
|
* @param size size of request, in bytes (IN)
|
|
*
|
|
* @retval addr virtual address
|
|
*/
|
|
|
|
void *
|
|
mca_common_sm_posix_seg_alloc(struct mca_mpool_base_module_t* mpool,
|
|
size_t* size,
|
|
mca_mpool_base_registration_t** registration)
|
|
{
|
|
mca_mpool_sm_module_t *sm_module = (mca_mpool_sm_module_t *) mpool;
|
|
mca_common_sm_module_posix_t *map =
|
|
(mca_common_sm_module_posix_t *)sm_module->sm_common_module;
|
|
mca_common_sm_seg_header_t *seg = map->super.module_seg;
|
|
void *addr;
|
|
|
|
opal_atomic_lock(&seg->seg_lock);
|
|
if(seg->seg_offset + *size > seg->seg_size)
|
|
{
|
|
addr = NULL;
|
|
}
|
|
else
|
|
{
|
|
size_t fixup;
|
|
|
|
/* add base address to segment offset */
|
|
addr = map->super.module_data_addr + seg->seg_offset;
|
|
seg->seg_offset += *size;
|
|
|
|
/**
|
|
* fix up seg_offset so next allocation is aligned on a
|
|
* sizeof(long) boundry. Do it here so that we don't have to
|
|
* check before checking remaining size in buffer
|
|
*/
|
|
if ((fixup = (seg->seg_offset & (sizeof(long) - 1))) > 0)
|
|
{
|
|
seg->seg_offset += sizeof(long) - fixup;
|
|
}
|
|
}
|
|
if (NULL != registration)
|
|
{
|
|
*registration = NULL;
|
|
}
|
|
opal_atomic_unlock(&seg->seg_lock);
|
|
return addr;
|
|
}
|
|
|