From 2fb7c344fc2fa55275b179948b7b1a7b5ccca02e Mon Sep 17 00:00:00 2001 From: Samuel Gutierrez Date: Wed, 9 Jun 2010 16:58:52 +0000 Subject: [PATCH] Added a new System V (sysv) shared memory component for Open MPI. Configure Option: --enable-sysv MCA Parameter: mpi_common_sm mpi_common_sm accepts a comma delimited list of: [sysv],mmap (order dependent). The first component that is successfully selected is used. For example, -mca mpi_common_sm sysv,mmap will first try sysv. If sysv is not successfully selected, then mmap will be used. mmap will be used if mpi_common_sm is not provided. Notes: Please make certain that your system's shmmax limit, or equivalent, is larger than mpool_sm_min_size. Otherwise, shmget may fail. This commit was SVN r23260. --- AUTHORS | 1 + LICENSE | 2 +- README | 4 + .../CMakeModules/check_mca_subdirs.cmake | 15 +- .../CMakeModules/ompi_check_Microsoft.cmake | 3 + ompi/mca/btl/sm/btl_sm.c | 34 +- ompi/mca/btl/sm/btl_sm.h | 35 +- ompi/mca/btl/sm/btl_sm_component.c | 77 +- ompi/mca/coll/sm/coll_sm.h | 4 +- ompi/mca/coll/sm/coll_sm_module.c | 28 +- ompi/mca/common/sm/.windows | 2 +- ompi/mca/common/sm/Makefile.am | 18 + ompi/mca/common/sm/common_sm.c | 269 +++++++ ompi/mca/common/sm/common_sm.h | 192 +++++ ompi/mca/common/sm/common_sm_mmap.c | 260 ++----- ompi/mca/common/sm/common_sm_mmap.h | 92 +-- ompi/mca/common/sm/common_sm_sysv.c | 666 ++++++++++++++++++ ompi/mca/common/sm/common_sm_sysv.h | 145 ++++ ompi/mca/common/sm/common_sm_windows.c | 299 ++++++++ ompi/mca/common/sm/common_sm_windows.h | 135 ++++ ompi/mca/common/sm/configure.m4 | 73 ++ ompi/mca/common/sm/help-mpi-common-sm.txt | 23 + ompi/mca/mpool/sm/mpool_sm.h | 6 +- ompi/mca/mpool/sm/mpool_sm_component.c | 18 +- ompi/mca/mpool/sm/mpool_sm_module.c | 30 +- 25 files changed, 2076 insertions(+), 355 deletions(-) create mode 100644 ompi/mca/common/sm/common_sm.c create mode 100644 ompi/mca/common/sm/common_sm.h create mode 100644 ompi/mca/common/sm/common_sm_sysv.c create mode 100644 
ompi/mca/common/sm/common_sm_sysv.h create mode 100644 ompi/mca/common/sm/common_sm_windows.c create mode 100644 ompi/mca/common/sm/common_sm_windows.h create mode 100644 ompi/mca/common/sm/configure.m4 diff --git a/AUTHORS b/AUTHORS index 3604b30029..1478c3cb74 100644 --- a/AUTHORS +++ b/AUTHORS @@ -62,6 +62,7 @@ rolfv Rolf Vandevaart Sun rta Rob Awles LANL rusraink Rainer Keller HLRS, ORNL sami Sami Ayyorgun LANL +samuel Samuel K. Gutierrez LANL santhana Gopal Santhanaraman OSU sharonm Sharon Melamed Voltaire shiqing Shiqing Fan HLRS diff --git a/LICENSE b/LICENSE index 771d37ee6c..eda134fd20 100644 --- a/LICENSE +++ b/LICENSE @@ -14,7 +14,7 @@ Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, University of Stuttgart. All rights reserved. Copyright (c) 2004-2007 The Regents of the University of California. All rights reserved. -Copyright (c) 2006-2008 Los Alamos National Security, LLC. All rights +Copyright (c) 2006-2010 Los Alamos National Security, LLC. All rights reserved. Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved. Copyright (c) 2006-2008 Voltaire, Inc. All rights reserved. diff --git a/README b/README index b666d4da6d..4e8ee9e73e 100644 --- a/README +++ b/README @@ -895,6 +895,10 @@ for a full list); a summary of the more commonly used ones follows: with different endian representations). Heterogeneous support is disabled by default because it imposes a minor performance penalty. +--enable-sysv + Enable System V (sysv) shared memory support. By default, System V + shared memory support is disabled. 
+ --with-wrapper-cflags= --with-wrapper-cxxflags= --with-wrapper-fflags= diff --git a/contrib/platform/win32/CMakeModules/check_mca_subdirs.cmake b/contrib/platform/win32/CMakeModules/check_mca_subdirs.cmake index 9548072b37..105f9899c8 100644 --- a/contrib/platform/win32/CMakeModules/check_mca_subdirs.cmake +++ b/contrib/platform/win32/CMakeModules/check_mca_subdirs.cmake @@ -87,7 +87,7 @@ FOREACH (MCA_FRAMEWORK ${MCA_FRAMEWORK_LIST}) IF(EXISTS "${CURRENT_PATH}/.windows") #MESSAGE("MCA_FRAMEWORK_BASE_FILES:${MCA_FRAMEWORK_BASE_FILES}") - SET(EXCLUDE_LIST "") + SET(EXCLUDE_LIST"") FILE(STRINGS ${CURRENT_PATH}/.windows EXCLUDE_LIST REGEX "^exclude_list=") IF(NOT EXCLUDE_LIST STREQUAL "") @@ -122,6 +122,19 @@ FOREACH (MCA_FRAMEWORK ${MCA_FRAMEWORK_LIST}) FILE(GLOB COMPONENT_FILES "${CURRENT_PATH}/*.C" "${CURRENT_PATH}/*.h" "${CURRENT_PATH}/*.cc" "${CURRENT_PATH}/*.cpp") + #check exclude list + SET(EXCLUDE_LIST"") + FILE(STRINGS ${CURRENT_PATH}/.windows EXCLUDE_LIST REGEX "^exclude_list=") + + IF(NOT EXCLUDE_LIST STREQUAL "") + STRING(REPLACE "exclude_list=" "" EXCLUDE_LIST ${EXCLUDE_LIST}) + ENDIF(NOT EXCLUDE_LIST STREQUAL "") + + # remove the files in the exclude list + FOREACH(FILE ${EXCLUDE_LIST}) + LIST(REMOVE_ITEM MCA_FRAMEWORK_BASE_FILES "${CURRENT_PATH}/${FILE}") + ENDFOREACH(FILE) + # by default, build this component. 
SET(BUILD_COMPONENT TRUE) diff --git a/contrib/platform/win32/CMakeModules/ompi_check_Microsoft.cmake b/contrib/platform/win32/CMakeModules/ompi_check_Microsoft.cmake index d5f7a10b64..c2bcb1b90f 100644 --- a/contrib/platform/win32/CMakeModules/ompi_check_Microsoft.cmake +++ b/contrib/platform/win32/CMakeModules/ompi_check_Microsoft.cmake @@ -171,6 +171,9 @@ OMPI_DEF_VAR(HAVE_INTERLOCKEDCOMPAREEXCHANGE64 "Whether we support 64 bits atomi OMPI_DEF_VAR(HAVE_INTERLOCKEDCOMPAREEXCHANGEACQUIRE "Whether we support 32 bits atomic operations on Windows" 0 0) OMPI_DEF_VAR(HAVE_INTERLOCKEDCOMPAREEXCHANGERELEASE "Whether we support 32 bits atomic operations on Windows" 0 0) +OMPI_DEF(MCA_COMMON_SM_WINDOWS 1 "Whether we have shared memory support for Windows or not." 0 1) +OMPI_DEF(MCA_COMMON_SM_SYSV 0 "Whether we have shared memory support for Windows or not." 0 1) + OMPI_CHECK_INCLUDE_FILE (windows.h HAVE_WINDOWS_H) OMPI_CHECK_INCLUDE_FILE (winsock2.h HAVE_WINSOCK2_H) diff --git a/ompi/mca/btl/sm/btl_sm.c b/ompi/mca/btl/sm/btl_sm.c index 0cb0ec6c22..39dc711927 100644 --- a/ompi/mca/btl/sm/btl_sm.c +++ b/ompi/mca/btl/sm/btl_sm.c @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2010 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -230,7 +232,7 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n) /* before we multiply by n, make sure the result won't overflow */ /* Stick that little pad in, particularly since we'll eventually * need a little extra space. E.g., in mca_mpool_sm_init() in - * mpool_sm_component.c when sizeof(mca_common_sm_mmap_t) is + * mpool_sm_component.c when sizeof(mca_common_sm_module_t) is * added. 
*/ if ( ((double) res.size) * n > LONG_MAX - 4096 ) @@ -270,13 +272,13 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n) /* Pass in a data segment alignment of 0 to get no data segment (only the shared control structure) */ - size = sizeof(mca_common_sm_file_header_t) + + size = sizeof(mca_common_sm_seg_header_t) + n * (sizeof(sm_fifo_t*) + sizeof(char *) + sizeof(uint16_t)) + CACHE_LINE_SIZE; procs = ompi_proc_world(&num_procs); - if (!(mca_btl_sm_component.mmap_file = - mca_common_sm_mmap_init(procs, num_procs, size, sm_ctl_file, - sizeof(mca_common_sm_file_header_t), - CACHE_LINE_SIZE))) { + if (!(mca_btl_sm_component.sm_seg = + mca_common_sm_init(procs, num_procs, size, sm_ctl_file, + sizeof(mca_common_sm_seg_header_t), + CACHE_LINE_SIZE))) { opal_output(0, "mca_btl_sm_add_procs: unable to create shared memory " "BTL coordinating strucure :: size %lu \n", (unsigned long)size); @@ -289,7 +291,7 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n) /* set the pointer to the shared memory control structure */ mca_btl_sm_component.sm_ctl_header = - (mca_common_sm_file_header_t*)mca_btl_sm_component.mmap_file->map_seg; + (mca_common_sm_seg_header_t*)mca_btl_sm_component.sm_seg->module_seg; /* check to make sure number of local procs is within the @@ -300,7 +302,7 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n) return OMPI_ERROR; } - mca_btl_sm_component.shm_fifo = (volatile sm_fifo_t **)mca_btl_sm_component.mmap_file->data_addr; + mca_btl_sm_component.shm_fifo = (volatile sm_fifo_t **)mca_btl_sm_component.sm_seg->module_data_addr; mca_btl_sm_component.shm_bases = (char**)(mca_btl_sm_component.shm_fifo + n); mca_btl_sm_component.shm_mem_nodes = (uint16_t*)(mca_btl_sm_component.shm_bases + n); @@ -538,9 +540,9 @@ int mca_btl_sm_add_procs( /* Sync with other local procs. Force the FIFO initialization to always * happens before the readers access it. 
*/ - opal_atomic_add_32( &mca_btl_sm_component.mmap_file->map_seg->seg_inited, 1); + opal_atomic_add_32( &mca_btl_sm_component.sm_seg->module_seg->seg_inited, 1); while( n_local_procs > - mca_btl_sm_component.mmap_file->map_seg->seg_inited) { + mca_btl_sm_component.sm_seg->module_seg->seg_inited) { opal_progress(); opal_atomic_rmb(); } @@ -1106,13 +1108,13 @@ int mca_btl_sm_ft_event(int state) { } if(OPAL_CRS_CHECKPOINT == state) { - if( NULL != mca_btl_sm_component.mmap_file ) { + if( NULL != mca_btl_sm_component.sm_seg ) { /* On restart we need the old file names to exist (not necessarily * contain content) so the CRS component does not fail when searching * for these old file handles. The restart procedure will make sure * these files get cleaned up appropriately. */ - opal_crs_base_metadata_write_token(NULL, CRS_METADATA_TOUCH, mca_btl_sm_component.mmap_file->map_path); + opal_crs_base_metadata_write_token(NULL, CRS_METADATA_TOUCH, mca_btl_sm_component.sm_seg->map_path); /* Record the job session directory */ opal_crs_base_metadata_write_token(NULL, CRS_METADATA_MKDIR, orte_process_info.job_session_dir); @@ -1120,11 +1122,11 @@ int mca_btl_sm_ft_event(int state) { } else if(OPAL_CRS_CONTINUE == state) { if( ompi_cr_continue_like_restart ) { - if( NULL != mca_btl_sm_component.mmap_file ) { + if( NULL != mca_btl_sm_component.sm_seg ) { /* Do not Add session directory on continue */ /* Add shared memory file */ - opal_crs_base_cleanup_append(mca_btl_sm_component.mmap_file->map_path, false); + opal_crs_base_cleanup_append(mca_btl_sm_component.sm_seg->map_path, false); } /* Clear this so we force the module to re-init the sm files */ @@ -1133,7 +1135,7 @@ int mca_btl_sm_ft_event(int state) { } else if(OPAL_CRS_RESTART == state || OPAL_CRS_RESTART_PRE == state) { - if( NULL != mca_btl_sm_component.mmap_file ) { + if( NULL != mca_btl_sm_component.sm_seg ) { /* Add session directory */ opal_crs_base_cleanup_append(orte_process_info.job_session_dir, true); tmp_dir = 
opal_dirname(orte_process_info.job_session_dir); @@ -1143,7 +1145,7 @@ int mca_btl_sm_ft_event(int state) { tmp_dir = NULL; } /* Add shared memory file */ - opal_crs_base_cleanup_append(mca_btl_sm_component.mmap_file->map_path, false); + opal_crs_base_cleanup_append(mca_btl_sm_component.sm_seg->map_path, false); } /* Clear this so we force the module to re-init the sm files */ diff --git a/ompi/mca/btl/sm/btl_sm.h b/ompi/mca/btl/sm/btl_sm.h index 52dc83527c..6f39fb6325 100644 --- a/ompi/mca/btl/sm/btl_sm.h +++ b/ompi/mca/btl/sm/btl_sm.h @@ -1,4 +1,3 @@ - /* * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology @@ -11,7 +10,9 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved. - * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2010 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -67,7 +68,7 @@ #include "ompi/mca/btl/base/base.h" #include "ompi/mca/mpool/mpool.h" -#include "ompi/mca/common/sm/common_sm_mmap.h" +#include "ompi/mca/common/sm/common_sm.h" BEGIN_C_DECLS @@ -151,8 +152,8 @@ struct mca_btl_sm_component_t { size_t eager_limit; /**< first fragment size */ size_t max_frag_size; /**< maximum (second and beyone) fragment size */ opal_mutex_t sm_lock; - mca_common_sm_mmap_t *mmap_file; /**< description of mmap'ed file */ - mca_common_sm_file_header_t *sm_ctl_header; /* control header in + mca_common_sm_module_t *sm_seg; /**< description of shared memory segment */ + mca_common_sm_seg_header_t *sm_ctl_header; /* control header in shared memory */ volatile sm_fifo_t **shm_fifo; /**< pointer to fifo 2D array in shared memory */ char **shm_bases; /**< pointer to base pointers in shared memory */ @@ -365,30 +366,6 @@ static inline void *sm_fifo_read(sm_fifo_t *fifo) return value; } -/** - * Register shared memory module parameters with the MCA framework - */ -extern int mca_btl_sm_component_open(void); - -/** - * Any final cleanup before being unloaded. - */ -extern int mca_btl_sm_component_close(void); - -/** - * SM module initialization. - * - * @param num_btls (OUT) Number of BTLs returned in BTL array. - * @param enable_progress_threads (IN) Flag indicating whether BTL is allowed to have progress threads - * @param enable_mpi_threads (IN) Flag indicating whether BTL must support multilple simultaneous invocations from different threads - * - */ -extern mca_btl_base_module_t** mca_btl_sm_component_init( - int *num_btls, - bool enable_progress_threads, - bool enable_mpi_threads -); - /** * shared memory component progress. 
*/ diff --git a/ompi/mca/btl/sm/btl_sm_component.c b/ompi/mca/btl/sm/btl_sm_component.c index f73633b803..bf8b29cb27 100644 --- a/ompi/mca/btl/sm/btl_sm_component.c +++ b/ompi/mca/btl/sm/btl_sm_component.c @@ -10,7 +10,9 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved. - * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2010 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -48,7 +50,7 @@ #include "opal/mca/base/mca_base_param.h" #include "ompi/mca/mpool/base/base.h" -#include "ompi/mca/common/sm/common_sm_mmap.h" +#include "ompi/mca/common/sm/common_sm.h" #include "ompi/mca/btl/base/btl_base_error.h" #if OPAL_ENABLE_FT_CR == 1 @@ -59,6 +61,16 @@ #include "btl_sm_frag.h" #include "btl_sm_fifo.h" +static int mca_btl_sm_component_open(void); +static int mca_btl_sm_component_close(void); +static int sm_register(void); +static mca_btl_base_module_t** mca_btl_sm_component_init( + int *num_btls, + bool enable_progress_threads, + bool enable_mpi_threads +); + + /* * Shared Memory (SM) component instance. */ @@ -74,7 +86,9 @@ mca_btl_sm_component_t mca_btl_sm_component = { OMPI_MINOR_VERSION, /* MCA component minor version */ OMPI_RELEASE_VERSION, /* MCA component release version */ mca_btl_sm_component_open, /* component open */ - mca_btl_sm_component_close /* component close */ + mca_btl_sm_component_close, /* component close */ + NULL, + sm_register, }, { /* The component is checkpoint ready */ @@ -112,12 +126,7 @@ static inline int mca_btl_sm_param_register_int( } -/* - * Called by MCA framework to open the component, registers - * component parameters. 
- */ - -int mca_btl_sm_component_open(void) +static int sm_register(void) { int i; @@ -162,7 +171,6 @@ int mca_btl_sm_component_open(void) false, false, 0, &mca_btl_sm_component.knem_max_simultaneous); - mca_btl_sm_component.sm_max_btls = 1; /* register SM component parameters */ mca_btl_sm_component.sm_free_list_num = mca_btl_sm_param_register_int("free_list_num", 8); @@ -178,6 +186,32 @@ int mca_btl_sm_component_open(void) mca_btl_sm_param_register_int("fifo_size", 4096); mca_btl_sm_component.nfifos = mca_btl_sm_param_register_int("num_fifos", 1); + + mca_btl_sm_component.fifo_lazy_free = + mca_btl_sm_param_register_int("fifo_lazy_free", 120); + + /* default number of extra procs to allow for future growth */ + mca_btl_sm_component.sm_extra_procs = + mca_btl_sm_param_register_int("sm_extra_procs", 0); + + /* Call the BTL based to register its MCA params */ + mca_btl_base_param_register(&mca_btl_sm_component.super.btl_version, + &mca_btl_sm.super); + + /* Call down to sm common to register its MCA params */ + mca_common_sm_param_register(&mca_btl_sm_component.super.btl_version); + + return OMPI_SUCCESS; +} + +/* + * Called by MCA framework to open the component, registers + * component parameters. 
+ */ + +static int mca_btl_sm_component_open(void) +{ + mca_btl_sm_component.sm_max_btls = 1; /* make sure the number of fifos is a power of 2 */ { int i = 1; @@ -185,8 +219,6 @@ int mca_btl_sm_component_open(void) i <<= 1; mca_btl_sm_component.nfifos = i; } - mca_btl_sm_component.fifo_lazy_free = - mca_btl_sm_param_register_int("fifo_lazy_free", 120); /* make sure that queue size and lazy free parameter are compatible */ if (mca_btl_sm_component.fifo_lazy_free >= (mca_btl_sm_component.fifo_size >> 1) ) @@ -194,9 +226,6 @@ int mca_btl_sm_component_open(void) if (mca_btl_sm_component.fifo_lazy_free <= 0) mca_btl_sm_component.fifo_lazy_free = 1; - /* default number of extra procs to allow for future growth */ - mca_btl_sm_component.sm_extra_procs = - mca_btl_sm_param_register_int("sm_extra_procs", 0); mca_btl_sm.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH-1; mca_btl_sm.super.btl_eager_limit = 4*1024; @@ -214,8 +243,6 @@ int mca_btl_sm_component_open(void) mca_btl_sm.super.btl_bandwidth = 9000; /* Mbs */ mca_btl_sm.super.btl_latency = 1; /* Microsecs */ - mca_btl_base_param_register(&mca_btl_sm_component.super.btl_version, - &mca_btl_sm.super); mca_btl_sm_component.max_frag_size = mca_btl_sm.super.btl_max_send_size; mca_btl_sm_component.eager_limit = mca_btl_sm.super.btl_eager_limit; @@ -233,7 +260,7 @@ int mca_btl_sm_component_open(void) * component cleanup - sanity checking of queue lengths */ -int mca_btl_sm_component_close(void) +static int mca_btl_sm_component_close(void) { int return_value = OMPI_SUCCESS; @@ -263,12 +290,12 @@ int mca_btl_sm_component_close(void) /*OBJ_DESTRUCT(&mca_btl_sm_component.sm_frags_max);*/ /* unmap the shared memory control structure */ - if(mca_btl_sm_component.mmap_file != NULL) { - return_value = mca_common_sm_mmap_fini( mca_btl_sm_component.mmap_file ); + if(mca_btl_sm_component.sm_seg != NULL) { + return_value = mca_common_sm_fini( mca_btl_sm_component.sm_seg ); if( OMPI_SUCCESS != return_value ) { return_value=OMPI_ERROR; 
opal_output(0," munmap failed :: file - %s :: errno - %d \n", - mca_btl_sm_component.mmap_file->map_addr, + mca_btl_sm_component.sm_seg->module_seg_addr, errno); goto CLEANUP; } @@ -283,12 +310,12 @@ int mca_btl_sm_component_close(void) */ if(OPAL_CR_STATUS_RESTART_PRE != opal_cr_checkpointing_state && OPAL_CR_STATUS_RESTART_POST != opal_cr_checkpointing_state ) { - unlink(mca_btl_sm_component.mmap_file->map_path); + unlink(mca_btl_sm_component.sm_seg->module_seg_path); } #else - unlink(mca_btl_sm_component.mmap_file->map_path); + unlink(mca_btl_sm_component.sm_seg->module_seg_path); #endif - OBJ_RELEASE(mca_btl_sm_component.mmap_file); + OBJ_RELEASE(mca_btl_sm_component.sm_seg); } #if OPAL_ENABLE_PROGRESS_THREADS == 1 @@ -320,7 +347,7 @@ CLEANUP: /* * SM component initialization */ -mca_btl_base_module_t** mca_btl_sm_component_init( +static mca_btl_base_module_t** mca_btl_sm_component_init( int *num_btls, bool enable_progress_threads, bool enable_mpi_threads) diff --git a/ompi/mca/coll/sm/coll_sm.h b/ompi/mca/coll/sm/coll_sm.h index 5d80990c78..12f57a809b 100644 --- a/ompi/mca/coll/sm/coll_sm.h +++ b/ompi/mca/coll/sm/coll_sm.h @@ -28,7 +28,7 @@ #include "opal/datatype/opal_convertor.h" #include "orte/types.h" #include "ompi/mca/coll/coll.h" -#include "ompi/mca/common/sm/common_sm_mmap.h" +#include "ompi/mca/common/sm/common_sm.h" BEGIN_C_DECLS @@ -142,7 +142,7 @@ BEGIN_C_DECLS typedef struct mca_coll_sm_comm_t { /* Meta data that we get back from the common mmap allocation function */ - mca_common_sm_mmap_t *mcb_mmap; + mca_common_sm_module_t *sm_bootstrap_meta; /** Pointer to my barrier control pages (odd index pages are "in", even index pages are "out") */ diff --git a/ompi/mca/coll/sm/coll_sm_module.c b/ompi/mca/coll/sm/coll_sm_module.c index 79781bf3ec..4bdac1970d 100644 --- a/ompi/mca/coll/sm/coll_sm_module.c +++ b/ompi/mca/coll/sm/coll_sm_module.c @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. 
* Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2010 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -96,10 +98,10 @@ static void mca_coll_sm_module_destruct(mca_coll_sm_module_t *module) if (NULL != c) { /* Munmap the per-communicator shmem data segment */ - if (NULL != c->mcb_mmap) { + if (NULL != c->sm_bootstrap_meta) { /* Ignore any errors -- what are we going to do about them? */ - mca_common_sm_mmap_fini(c->mcb_mmap); + mca_common_sm_fini(c->sm_bootstrap_meta); } free(c); } @@ -376,7 +378,7 @@ int ompi_coll_sm_lazy_enable(mca_coll_base_module_t *module, children are contiguous, so having the first pointer and the num_children from the mcb_tree data is sufficient). */ control_size = c->sm_control_size; - base = data->mcb_mmap->data_addr; + base = data->sm_bootstrap_meta->module_data_addr; data->mcb_barrier_control_me = (uint32_t*) (base + (rank * control_size * num_barrier_buffers * 2)); if (data->mcb_tree[rank].mcstn_parent) { @@ -472,20 +474,20 @@ int ompi_coll_sm_lazy_enable(mca_coll_base_module_t *module, OBJ_RETAIN(sm_module->previous_reduce_module); /* Indicate that we have successfully attached and setup */ - opal_atomic_add(&(data->mcb_mmap->map_seg->seg_inited), 1); + opal_atomic_add(&(data->sm_bootstrap_meta->module_seg->seg_inited), 1); /* Wait for everyone in this communicator to attach and setup */ opal_output_verbose(10, mca_coll_base_output, "coll:sm:enable (%d/%s): waiting for peers to attach", comm->c_contextid, comm->c_name); - SPIN_CONDITION(size == data->mcb_mmap->map_seg->seg_inited, seg_init_exit); + SPIN_CONDITION(size == data->sm_bootstrap_meta->module_seg->seg_inited, seg_init_exit); /* Once we're all here, remove the mmap file; it's not needed anymore */ if (0 == rank) { - unlink(data->mcb_mmap->map_path); + unlink(data->sm_bootstrap_meta->module_seg_path); opal_output_verbose(10, mca_coll_base_output, "coll:sm:enable (%d/%s): 
removed mmap file %s", - comm->c_contextid, comm->c_name, data->mcb_mmap->map_path); + comm->c_contextid, comm->c_name, data->sm_bootstrap_meta->module_seg_path); } /* All done */ @@ -589,13 +591,13 @@ static int bootstrap_comm(ompi_communicator_t *comm, opal_output_verbose(10, mca_coll_base_output, "coll:sm:enable:bootstrap comm (%d/%s): attaching to %" PRIsize_t " byte mmap: %s", comm->c_contextid, comm->c_name, size, fullpath); - data->mcb_mmap = - mca_common_sm_mmap_init_group(comm->c_local_group, size, fullpath, - sizeof(mca_common_sm_file_header_t), - sizeof(void*)); - if (NULL == data->mcb_mmap) { + data->sm_bootstrap_meta = + mca_common_sm_init_group(comm->c_local_group, size, fullpath, + sizeof(mca_common_sm_seg_header_t), + sizeof(void*)); + if (NULL == data->sm_bootstrap_meta) { opal_output_verbose(10, mca_coll_base_output, - "coll:sm:enable:bootstrap comm (%d/%s): common_sm_mmap_init_group failed", + "coll:sm:enable:bootstrap comm (%d/%s): mca_common_sm_init_group failed", comm->c_contextid, comm->c_name); return OMPI_ERR_OUT_OF_RESOURCE; } diff --git a/ompi/mca/common/sm/.windows b/ompi/mca/common/sm/.windows index c001c66566..470d7d7a06 100644 --- a/ompi/mca/common/sm/.windows +++ b/ompi/mca/common/sm/.windows @@ -9,4 +9,4 @@ # # Specific to this module - +exclude_list=common_sm_mmap.c;common_sm_mmap.h;common_sm_sysv.c;common_sm_sysv.h diff --git a/ompi/mca/common/sm/Makefile.am b/ompi/mca/common/sm/Makefile.am index e70fe28df2..7f3fe22785 100644 --- a/ompi/mca/common/sm/Makefile.am +++ b/ompi/mca/common/sm/Makefile.am @@ -44,13 +44,31 @@ EXTRA_DIST = .windows # Header files headers = \ + common_sm.h \ common_sm_mmap.h # Source files sources = \ + common_sm.c \ common_sm_mmap.c +# Only build the Windows support if we're building on windows, but +# always include the files in the tarball. 
+ +if MCA_common_sm_windows +headers += common_sm_windows.h +sources += common_sm_windows.c +endif + +# Only build the SYSV support if we have the right stuff, but +# always include the files in the tarball. + +if MCA_common_sm_sysv +headers += common_sm_sysv.h +sources += common_sm_sysv.c +endif + # Help file dist_pkgdata_DATA = help-mpi-common-sm.txt diff --git a/ompi/mca/common/sm/common_sm.c b/ompi/mca/common/sm/common_sm.c new file mode 100644 index 0000000000..cde1b28bba --- /dev/null +++ b/ompi/mca/common/sm/common_sm.c @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2008-2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2010 Los Alamos National Security, LLC. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#ifdef HAVE_STRING_H +#include <string.h> +#endif + +#include "ompi/constants.h" +#include "orte/util/show_help.h" +#include "opal/util/argv.h" + +#include "common_sm_mmap.h" +#if MCA_COMMON_SM_SYSV +#include "common_sm_sysv.h" +#endif /* MCA_COMMON_SM_SYSV */ +#if MCA_COMMON_SM_WINDOWS +#include "common_sm_windows.h" +#endif /* MCA_COMMON_SM_WINDOWS */ + +static int initialized = 0; +static int sysv_index = -1; +static char **sm_argv = NULL; +/* let mmap be the default selection */ +static char *sm_params = "mmap"; +static mca_common_sm_init_fn_t sm_init = NULL; +static mca_common_sm_init_group_fn_t sm_init_group = NULL; +static mca_common_sm_seg_alloc_fn_t sm_seg_alloc = NULL; +static mca_common_sm_fini_fn_t sm_fini = NULL; +static char sm_all_buff[OPAL_PATH_MAX]; + +mca_common_sm_module_t *mca_common_sm_module = NULL; + +/******************************************************************************/ +int +mca_common_sm_param_register(mca_base_component_t *c) +{ + char sm_avail_help_str[OPAL_PATH_MAX]; + + if (-1 == sysv_index) + { + if (MCA_COMMON_SM_SYSV) + { + snprintf( + sm_avail_help_str, + sizeof(sm_avail_help_str) - 1, + "Which shared memory support will be used. " + "Valid values: sysv,mmap - or a comma delimited " + "combination of them (order dependent). The first component " + "that is successfully selected is used." + ); + /** + * construct a comma-separated list of valid options for "all". + * notice that we are going to try sysv first. + */ + snprintf(sm_all_buff, sizeof(sm_all_buff) - 1, "sysv,mmap"); + } + else /* only mmap is available */ + { + snprintf( + sm_avail_help_str, + sizeof(sm_avail_help_str) - 1, + "Which shared memory support will be used. " + "Valid values: mmap." 
+ ); + snprintf(sm_all_buff, sizeof(sm_all_buff) - 1, "mmap"); + } + + mca_base_param_reg_string_name("mpi", + "common_sm", + sm_avail_help_str, + false, + false, + sm_params, + &sm_params); + + /* empty == try all available */ + if (0 == strcmp(sm_params, "")) + { + if (NULL == (sm_argv = opal_argv_split(sm_all_buff, ','))) + { + opal_output(0, + "WARNING: could not parse mpi_common_sm request."); + } + } + else + { + if (NULL == (sm_argv = opal_argv_split(sm_params, ','))) + { + opal_output(0, + "WARNING: could not parse mpi_common_sm request."); + } + } + + sysv_index = mca_base_param_reg_int_name( + "mpi", + "common_sm_have_sysv_support", + "Whether shared memory has System V support or not", + false, + true, + MCA_COMMON_SM_SYSV, + NULL + ); + } + + /* Also register MCA param synonyms for the component */ + mca_base_param_reg_syn(sysv_index, c, "have_sysv_support", false); + + return OMPI_SUCCESS; +} + +/******************************************************************************/ +mca_common_sm_module_t * +mca_common_sm_init(ompi_proc_t **procs, + size_t num_procs, + size_t size, + char *file_name, + size_t size_ctl_structure, + size_t data_seg_alignment) +{ + if (!initialized) + { + int help_msg_displayed = 0; + int i; + + if (NULL != sm_argv) + { + /** + * iterate through the entire list + * stop when a valid component has been selected. + * + * warn the user when an invalid option was specified, + * but continue searching for a valid alternative. 
+ */ + for (i = 0; NULL != sm_argv[i] && NULL == sm_init; ++i) + { + if (0 == strcasecmp(sm_argv[i], "mmap")) + { +#if !MCA_COMMON_SM_WINDOWS + sm_init = mca_common_sm_mmap_init; + sm_init_group = mca_common_sm_mmap_init_group; + sm_seg_alloc = mca_common_sm_mmap_seg_alloc; + sm_fini = mca_common_sm_mmap_fini; +#else /* MCA_COMMON_SM_WINDOWS */ + sm_init = mca_common_sm_windows_init; + sm_init_group = mca_common_sm_windows_init_group; + sm_seg_alloc = mca_common_sm_windows_seg_alloc; + sm_fini = mca_common_sm_windows_fini; +#endif + } + else if (0 == strcasecmp(sm_argv[i], "sysv")) + { +#if !MCA_COMMON_SM_SYSV + if (!help_msg_displayed) + { + orte_show_help("help-mpi-common-sm.txt", + "sm support", + 1, + sm_argv[i]); + help_msg_displayed = 1; + } +#else /* MCA_COMMON_SM_SYSV */ + /* make sure that we can safely use sysv on this system */ + if (OMPI_SUCCESS == mca_common_sm_sysv_component_query()) + { + sm_init = mca_common_sm_sysv_init; + sm_init_group = mca_common_sm_sysv_init_group; + sm_seg_alloc = mca_common_sm_sysv_seg_alloc; + sm_fini = mca_common_sm_sysv_fini; + } + else /* let the user know that we tried sysv and failed */ + { + orte_show_help("help-mpi-common-sm.txt", + "sysv rt test fail", + 1); + } +#endif + } + else /* unknown value */ + { + if (!help_msg_displayed) + { + orte_show_help("help-mpi-common-sm.txt", + "sm support", + 1, + sm_argv[i]); + help_msg_displayed = 1; + } + } + } + if (NULL != sm_argv) + { + opal_argv_free(sm_argv); + } + } + initialized = 1; + } + + /* call the selected init function */ + if (NULL != sm_init) + { + return sm_init(procs, num_procs, size, + file_name, size_ctl_structure, + data_seg_alignment); + } + return NULL; +} + +/******************************************************************************/ +mca_common_sm_module_t * +mca_common_sm_init_group(ompi_group_t *group, + size_t size, + char *file_name, + size_t size_ctl_structure, + size_t data_seg_alignment) +{ + if (NULL != sm_init_group) + { + return 
sm_init_group(group, size, + file_name, size_ctl_structure, + data_seg_alignment); + } + return NULL; +} + +/******************************************************************************/ +void * +mca_common_sm_seg_alloc(struct mca_mpool_base_module_t* mpool, + size_t* size, + mca_mpool_base_registration_t** registration) +{ + if (NULL != sm_seg_alloc) + { + return sm_seg_alloc(mpool, size, registration); + } + return NULL; +} + +/******************************************************************************/ +int +mca_common_sm_fini(mca_common_sm_module_t *mca_common_sm_module) +{ + if (NULL != sm_fini) + { + return sm_fini(mca_common_sm_module); + } + return OMPI_ERR_NOT_FOUND; +} + diff --git a/ompi/mca/common/sm/common_sm.h b/ompi/mca/common/sm/common_sm.h new file mode 100644 index 0000000000..dfcf12acb2 --- /dev/null +++ b/ompi/mca/common/sm/common_sm.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2010 Los Alamos National Security, LLC. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef _COMMON_SM_H_ +#define _COMMON_SM_H_ + +#include "ompi_config.h" + +#include "opal/mca/mca.h" +#include "opal/class/opal_object.h" +#include "opal/class/opal_list.h" +#include "opal/sys/atomic.h" +#include "ompi/mca/mpool/mpool.h" +#include "ompi/proc/proc.h" +#include "ompi/group/group.h" + +BEGIN_C_DECLS + +struct mca_mpool_base_module_t; + /* header stored at the very start of a shared memory segment; the back ends cast the attached base address to this type */ +typedef struct mca_common_sm_seg_header_t +{ + /* lock to control atomic access */ + opal_atomic_lock_t seg_lock; + /* is the segment ready for use */ + volatile int32_t seg_inited; + /* offset to next available memory location available for allocation */ + size_t seg_offset; + /* size of the segment; the mmap and sysv back ends store (size - data offset) here when they initialize it */ + size_t seg_size; +} mca_common_sm_seg_header_t; + /* per-process description of an attached segment; the back ends embed it as the "super" member of their own module type */ +typedef struct mca_common_sm_module_t +{ + /* double link list element */ + opal_list_item_t module_item; + /* pointer to header embedded in the shared memory segment */ + mca_common_sm_seg_header_t *module_seg; + /* base address of the segment */ + unsigned char *module_seg_addr; + /* base address of data segment */ + unsigned char *module_data_addr; + /* how big it is (in bytes) */ + size_t module_size; + /* path string for the segment; the mmap back end copies file_name into it */ char module_seg_path[OPAL_PATH_MAX]; +#if defined(__WINDOWS__) + /* handle to the object */ + HANDLE hMappedObject; +#endif /* defined(__WINDOWS__) */ +} mca_common_sm_module_t; + +OBJ_CLASS_DECLARATION(mca_common_sm_module_t); + +OMPI_DECLSPEC extern int +mca_common_sm_param_register(mca_base_component_t *c); + +/** + * Register the MCA parameters for common sm. NOTE(review): this repeats the OMPI_DECLSPEC prototype just above; one of the two looks redundant. + */ +int +mca_common_sm_param_register(mca_base_component_t *c); + +/** + * This routine is used to set up a shared memory segment (whether + * it's an mmaped file or a SYSV IPC segment). It is assumed that + * the shared memory segment does not exist before any of the current + * set of processes try to open it. + * + * @param procs - array of (ompi_proc_t*)'s to create this shared + * memory segment for.
This array must be writable; it may be edited + * (in undefined ways) if the array contains procs that are not on + * this host. It is assumed that the caller will simply free this + * array upon return. (INOUT) + * + * @param num_procs - length of the procs array (IN) + * + * @param size - size of the segment, in bytes (IN) + * + * @param file_name - unique string identifier of this segment (IN) + * + * @param size_ctl_structure size of the control structure at + * the head of the segment. The control structure + * is assumed to have mca_common_sm_seg_header_t + * as its first element (IN) + * + * @param data_seg_alignment alignment of the data segment. this + * follows the control structure. If this + * value is 0, then assume that there will + * be no data segment following the control + * structure. (IN) + * + * @returnvalue pointer to the mca_common_sm_module_t describing the shared memory segment, or NULL. + */ +OMPI_DECLSPEC extern mca_common_sm_module_t * +mca_common_sm_init(ompi_proc_t **procs, + size_t num_procs, + size_t size, + char *file_name, + size_t size_ctl_structure, + size_t data_seg_alignment); + /* function-pointer type with the same signature as mca_common_sm_init() */ +typedef mca_common_sm_module_t * +(*mca_common_sm_init_fn_t)(ompi_proc_t **procs, + size_t num_procs, + size_t size, + char *file_name, + size_t size_ctl_structure, + size_t data_seg_alignment); + +/** + * This routine is used to set up a shared memory segment (whether + * it's an mmaped file or a SYSV IPC segment). It is assumed that + * the shared memory segment does not exist before any of the current + * set of processes try to open it. + * + * This routine is the same as mca_common_sm_init() except that + * it takes an (ompi_group_t*) parameter to specify the peers rather + * than an array of procs. Unlike mca_common_sm_init(), the + * group must contain *only* local peers, or this function will return + * NULL and not create any shared memory segment.
+ */ +OMPI_DECLSPEC extern mca_common_sm_module_t * +mca_common_sm_init_group(ompi_group_t *group, + size_t size, + char *file_name, + size_t size_ctl_structure, + size_t data_seg_alignment); + /* function-pointer type with the same signature as mca_common_sm_init_group() */ +typedef mca_common_sm_module_t * +(*mca_common_sm_init_group_fn_t)(ompi_group_t *group, + size_t size, + char *file_name, + size_t size_ctl_structure, + size_t data_seg_alignment); + +/** + * callback from the sm mpool; returns whatever the selected back end's seg_alloc routine returns, or NULL when no back end has been selected + */ +OMPI_DECLSPEC extern void * +mca_common_sm_seg_alloc(struct mca_mpool_base_module_t *mpool, + size_t* size, + mca_mpool_base_registration_t **registration); + /* function-pointer type with the same signature as mca_common_sm_seg_alloc() */ +typedef void * +(*mca_common_sm_seg_alloc_fn_t)(struct mca_mpool_base_module_t *mpool, + size_t* size, + mca_mpool_base_registration_t **registration); + +/** + * This function will release all local resources attached to the + * shared memory segment. We assume that the operating system will + * release the memory resources when the last process releases it. + * + * @param mca_common_sm_module - instance that is shared between + * components that use shared memory. + * + * @returnvalue 0 if everything was OK, otherwise an error value (OMPI_ERR_NOT_FOUND when no back end has been selected). + */ + +OMPI_DECLSPEC extern int +mca_common_sm_fini(mca_common_sm_module_t *mca_common_sm_module); + /* function-pointer type with the same signature as mca_common_sm_fini() */ +typedef int +(*mca_common_sm_fini_fn_t)(mca_common_sm_module_t *mca_common_sm_module); + +/* + * instance that is shared between components that use shared memory + */ +OMPI_DECLSPEC extern mca_common_sm_module_t *mca_common_sm_module; + +END_C_DECLS + +#endif /* _COMMON_SM_H_ */ + diff --git a/ompi/mca/common/sm/common_sm_mmap.c b/ompi/mca/common/sm/common_sm_mmap.c index 583412d734..7e5748014b 100644 --- a/ompi/mca/common/sm/common_sm_mmap.c +++ b/ompi/mca/common/sm/common_sm_mmap.c @@ -11,8 +11,8 @@ * All rights reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2008-2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010 Los Alamos National Security, LLC. - * All rights reserved.
+ * Copyright (c) 2010 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -63,7 +63,7 @@ #include "common_sm_mmap.h" OBJ_CLASS_INSTANCE( - mca_common_sm_mmap_t, + mca_common_sm_module_mmap_t, opal_object_t, NULL, NULL @@ -91,23 +91,23 @@ typedef struct { opal_list_item_t super; char file_name[OPAL_PATH_MAX]; int sm_file_inited; -} pending_rml_msg_t; +} pending_mmap_rml_msg_t; -OBJ_CLASS_INSTANCE(pending_rml_msg_t, opal_list_item_t, NULL, NULL); +OBJ_CLASS_INSTANCE(pending_mmap_rml_msg_t, opal_list_item_t, NULL, NULL); -#if !defined(__WINDOWS__) - -static mca_common_sm_mmap_t* create_map(int fd, size_t size, char *file_name, - size_t size_ctl_structure, - size_t data_seg_alignment) +static mca_common_sm_module_mmap_t * +create_map(int fd, size_t size, + char *file_name, + size_t size_ctl_structure, + size_t data_seg_alignment) { - mca_common_sm_mmap_t *map; - mca_common_sm_file_header_t *seg; + mca_common_sm_module_mmap_t *map; + mca_common_sm_seg_header_t *seg; unsigned char *addr = NULL; /* map the file and initialize segment state */ - seg = (mca_common_sm_file_header_t*) - mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + seg = (mca_common_sm_seg_header_t *) + mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); if (MAP_FAILED == seg) { orte_show_help("help-mpi-common-sm.txt", "sys call fail", 1, orte_process_info.nodename, @@ -117,12 +117,12 @@ static mca_common_sm_mmap_t* create_map(int fd, size_t size, char *file_name, } /* set up the map object */ - map = OBJ_NEW(mca_common_sm_mmap_t); - strncpy(map->map_path, file_name, OPAL_PATH_MAX); + map = OBJ_NEW(mca_common_sm_module_mmap_t); + strncpy(map->super.module_seg_path, file_name, OPAL_PATH_MAX); /* the first entry in the file is the control structure. 
The first - entry in the control structure is an mca_common_sm_file_header_t + entry in the control structure is an mca_common_sm_seg_header_t element */ - map->map_seg = seg; + map->super.module_seg = seg; addr = ((unsigned char *)seg) + size_ctl_structure; /* If we have a data segment (i.e., if 0 != data_seg_alignment), @@ -142,28 +142,39 @@ static mca_common_sm_mmap_t* create_map(int fd, size_t size, char *file_name, return NULL; } } - map->data_addr = addr; - map->map_addr = (unsigned char *)seg; - map->map_size = size; + map->super.module_data_addr = addr; + map->super.module_seg_addr = (unsigned char *)seg; + map->super.module_size = size; return map; } -mca_common_sm_mmap_t* mca_common_sm_mmap_init(ompi_proc_t **procs, - size_t num_procs, - size_t size, char *file_name, - size_t size_ctl_structure, - size_t data_seg_alignment) +/******************************************************************************/ +/** + * mca_common_sm_mmap_component_query + */ +int +mca_common_sm_mmap_component_query(void) +{ + return OMPI_SUCCESS; +} + +mca_common_sm_module_t * +mca_common_sm_mmap_init(ompi_proc_t **procs, + size_t num_procs, + size_t size, char *file_name, + size_t size_ctl_structure, + size_t data_seg_alignment) { int fd = -1; - mca_common_sm_mmap_t* map = NULL; + mca_common_sm_module_mmap_t *map = NULL; size_t mem_offset, p; int rc = 0, sm_file_inited = 0, num_local_procs; struct iovec iov[3]; int sm_file_created = OMPI_RML_TAG_SM_BACK_FILE_CREATED; char filename_to_send[OPAL_PATH_MAX]; opal_list_item_t *item; - pending_rml_msg_t *rml_msg; + pending_mmap_rml_msg_t *rml_msg; ompi_proc_t *temp_proc; bool found_lowest = false; @@ -225,7 +236,7 @@ mca_common_sm_mmap_t* mca_common_sm_mmap_init(ompi_proc_t **procs, if (0 == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, &(procs[0]->proc_name))) { - /* Check, whether the specified filename is on a network file system */ + /* check, whether the specified filename is on a network file system */ if 
(opal_path_nfs(file_name)) { orte_show_help("help-mpi-common-sm.txt", "mmap on nfs", 1, orte_process_info.nodename, file_name); @@ -255,11 +266,13 @@ mca_common_sm_mmap_t* mca_common_sm_mmap_init(ompi_proc_t **procs, /* initialize the segment - only the first process to open the file */ - mem_offset = map->data_addr - (unsigned char *)map->map_seg; - map->map_seg->seg_offset = mem_offset; - map->map_seg->seg_size = size - mem_offset; - opal_atomic_unlock(&map->map_seg->seg_lock); - map->map_seg->seg_inited = 0; + mem_offset = + map->super.module_data_addr - + (unsigned char *)map->super.module_seg; + map->super.module_seg->seg_offset = mem_offset; + map->super.module_seg->seg_size = size - mem_offset; + opal_atomic_unlock(&map->super.module_seg->seg_lock); + map->super.module_seg->seg_inited = 0; } else { close(fd); unlink(file_name); @@ -280,7 +293,7 @@ mca_common_sm_mmap_t* mca_common_sm_mmap_init(ompi_proc_t **procs, /* Free it all -- bad things are going to happen */ if (1 == sm_file_inited) { - munmap(map, size); + munmap(map->super.module_seg_addr, size); close(fd); unlink(file_name); fd = -1; @@ -299,7 +312,7 @@ mca_common_sm_mmap_t* mca_common_sm_mmap_init(ompi_proc_t **procs, for (item = opal_list_get_first(&pending_rml_msgs); opal_list_get_end(&pending_rml_msgs) != item; item = opal_list_get_next(item)) { - rml_msg = (pending_rml_msg_t*) item; + rml_msg = (pending_mmap_rml_msg_t*) item; if (0 == strcmp(rml_msg->file_name, file_name)) { opal_list_remove_item(&pending_rml_msgs, item); sm_file_inited = rml_msg->sm_file_inited; @@ -331,7 +344,7 @@ mca_common_sm_mmap_t* mca_common_sm_mmap_init(ompi_proc_t **procs, } /* If not, put it on the pending list and try again */ - rml_msg = OBJ_NEW(pending_rml_msg_t); + rml_msg = OBJ_NEW(pending_mmap_rml_msg_t); if (NULL == rml_msg) { ORTE_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE); /* fd/map wasn't opened here; no need to close/reset */ @@ -362,128 +375,8 @@ out: close(fd); } - return map; + return &(map->super); } -#else - 
-mca_common_sm_mmap_t* mca_common_sm_mmap_init(ompi_proc_t **procs, - size_t num_procs, - size_t size, char *file_name, - size_t size_ctl_structure, - size_t data_seg_alignment) -{ - int fd = -1, return_code = OMPI_SUCCESS; - bool file_previously_opened = false; - mca_common_sm_file_header_t* seg = NULL; - mca_common_sm_mmap_t* map = NULL; - unsigned char *addr = NULL; - size_t tmp, mem_offset; - - HANDLE hMapObject = INVALID_HANDLE_VALUE; - LPVOID lpvMem = NULL; - char *temp1, *temp2; - int rc; - - /** - * On Windows the shared file will be created by the OS directly on - * the system ressources. Therefore, no file get involved in the - * operation. However, a unique key should be used as name for the - * shared memory object in order to allow all processes to access - * the same unique shared memory region. The key will be obtained - * from the original file_name by replacing all path separator - * occurences by '/' (as '\' is not allowed on the object name). - */ - temp1 = strdup(file_name); - temp2 = temp1; - while( NULL != (temp2 = strchr(temp2, OPAL_PATH_SEP[0])) ) { - *temp2 = '/'; - } - hMapObject = CreateFileMapping( INVALID_HANDLE_VALUE, /* use paging file */ - NULL, /* no security attributes */ - PAGE_READWRITE, /* read/write access */ - 0, /* size: high 32-bits */ - (DWORD)size, /* size: low 32-bits */ - temp1); /* name of map object */ - if( NULL == hMapObject ) { - rc = GetLastError(); - goto return_error; - } - if( ERROR_ALREADY_EXISTS == GetLastError() ) - file_previously_opened=true; - free(temp1); /* relase the temporary file name */ - - /* Get a pointer to the file-mapped shared memory. 
*/ - lpvMem = MapViewOfFile( hMapObject, /* object to map view of */ - FILE_MAP_WRITE, /* read/write access */ - 0, /* high offset: map from */ - 0, /* low offset: beginning */ - 0); /* default: map entire file */ - if( NULL == lpvMem ) { - rc = GetLastError(); - goto return_error; - } - seg = (mca_common_sm_file_header_t*)lpvMem; - - /* set up the map object */ - map = OBJ_NEW(mca_common_sm_mmap_t); - strncpy(map->map_path, file_name, OPAL_PATH_MAX); - /* the first entry in the file is the control structure. The first - entry in the control structure is an mca_common_sm_file_header_t - element */ - map->map_seg = seg; - - /* If we have a data segment (i.e., if 0 != data_seg_alignment), - then make it the first aligned address after the control - structure. */ - if (0 != data_seg_alignment) { - addr = ((unsigned char *) seg) + size_ctl_structure; - /* calculate how far off alignment we are */ - tmp = ((size_t) addr) % data_seg_alignment; - /* if we're off alignment, then move up to the next alignment */ - if( tmp > 0 ) - addr += (data_seg_alignment - tmp); - - /* is addr past end of file ? 
*/ - if( (unsigned char*)seg+size < addr ) { - opal_output(0, "mca_common_sm_mmap_init: memory region too small len %d addr %p\n", - size,addr); - goto return_error; - } - map->data_addr = addr; - } else { - map->data_addr = NULL; - } - mem_offset = addr-(unsigned char *)seg; - map->map_addr = (unsigned char *)seg; - map->map_size = size; - - /* initialize the segment - only the first process to open the file */ - if( !file_previously_opened ) { - opal_atomic_unlock(&seg->seg_lock); - seg->seg_inited = false; - seg->seg_offset = mem_offset; - /* initialize size after subtracting out space used by the header */ - seg->seg_size = size - mem_offset; - } - - map->hMappedObject = hMapObject; - - return map; - - return_error: - { - char* localbuf = NULL; - FormatMessage( FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM, - NULL, rc, 0, (LPTSTR)&localbuf, 1024, NULL ); - opal_output( 0, "%s\n", localbuf ); - LocalFree( localbuf ); - } - if( NULL != lpvMem ) UnmapViewOfFile( lpvMem ); - if( NULL != hMapObject ) CloseHandle(hMapObject); - - return NULL; -} -#endif /* * Same as mca_common_sm_mmap_init(), but takes an (ompi_group_t*) @@ -492,15 +385,16 @@ mca_common_sm_mmap_t* mca_common_sm_mmap_init(ompi_proc_t **procs, * This function just checks the group to ensure that all the procs * are local, and if they are, calls mca_common_sm_mmap_init(). 
*/ -mca_common_sm_mmap_t* mca_common_sm_mmap_init_group(ompi_group_t *group, - size_t size, - char *file_name, - size_t size_ctl_structure, - size_t data_seg_alignment) +mca_common_sm_module_t * +mca_common_sm_mmap_init_group(ompi_group_t *group, + size_t size, + char *file_name, + size_t size_ctl_structure, + size_t data_seg_alignment) { size_t i, group_size; ompi_proc_t *proc, **procs; - mca_common_sm_mmap_t *ret; + mca_common_sm_module_t *ret; group_size = ompi_group_size(group); procs = (ompi_proc_t**) malloc(sizeof(ompi_proc_t*) * group_size); @@ -522,23 +416,18 @@ mca_common_sm_mmap_t* mca_common_sm_mmap_init_group(ompi_group_t *group, return ret; } -int mca_common_sm_mmap_fini( mca_common_sm_mmap_t* sm_mmap ) +int +mca_common_sm_mmap_fini(mca_common_sm_module_t *mca_common_sm_module) { + mca_common_sm_module_mmap_t *mmap_module = + (mca_common_sm_module_mmap_t *)mca_common_sm_module; int rc = OMPI_SUCCESS; - if( NULL != sm_mmap->map_seg ) { -#if !defined(__WINDOWS__) - rc = munmap((void*) sm_mmap->map_addr, sm_mmap->map_size ); - sm_mmap->map_addr = NULL; - sm_mmap->map_size = 0; -#else - BOOL return_error = UnmapViewOfFile( sm_mmap->map_addr ); - if( false == return_error ) { - rc = GetLastError(); - } - CloseHandle(sm_mmap->hMappedObject); - -#endif /* !defined(__WINDOWS__) */ + if( NULL != mmap_module->super.module_seg ) { + rc = munmap((void*) mmap_module->super.module_seg_addr, + mmap_module->super.module_size); + mmap_module->super.module_seg_addr = NULL; + mmap_module->super.module_size = 0; } return rc; } @@ -552,14 +441,15 @@ int mca_common_sm_mmap_fini( mca_common_sm_mmap_t* sm_mmap ) * @retval addr virtual address */ -void* mca_common_sm_mmap_seg_alloc( - struct mca_mpool_base_module_t* mpool, - size_t* size, - mca_mpool_base_registration_t** registration) +void * +mca_common_sm_mmap_seg_alloc(struct mca_mpool_base_module_t* mpool, + size_t* size, + mca_mpool_base_registration_t** registration) { mca_mpool_sm_module_t *sm_module = 
(mca_mpool_sm_module_t*) mpool; - mca_common_sm_mmap_t *map = sm_module->sm_common_mmap; - mca_common_sm_file_header_t* seg = map->map_seg; + mca_common_sm_module_mmap_t *map = + (mca_common_sm_module_mmap_t *)sm_module->sm_common_module; + mca_common_sm_seg_header_t* seg = map->super.module_seg; void* addr; opal_atomic_lock(&seg->seg_lock); @@ -569,7 +459,7 @@ void* mca_common_sm_mmap_seg_alloc( size_t fixup; /* add base address to segment offset */ - addr = map->data_addr + seg->seg_offset; + addr = map->super.module_data_addr + seg->seg_offset; seg->seg_offset += *size; /* fix up seg_offset so next allocation is aligned on a diff --git a/ompi/mca/common/sm/common_sm_mmap.h b/ompi/mca/common/sm/common_sm_mmap.h index 0ccc741c4d..dbd6f79fc5 100644 --- a/ompi/mca/common/sm/common_sm_mmap.h +++ b/ompi/mca/common/sm/common_sm_mmap.h @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2010 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -28,47 +30,18 @@ #include "ompi/mca/mpool/mpool.h" #include "ompi/proc/proc.h" #include "ompi/group/group.h" +#include "ompi/mca/common/sm/common_sm.h" BEGIN_C_DECLS struct mca_mpool_base_module_t; -typedef struct mca_common_sm_file_header_t { - /* lock to control atomic access */ - opal_atomic_lock_t seg_lock; - - /* is the segment ready for use */ - volatile int32_t seg_inited; - - /* Offset to next available memory location available for allocation */ - size_t seg_offset; - - /* total size of the segment */ - size_t seg_size; -} mca_common_sm_file_header_t; - - -typedef struct mca_common_sm_mmap_t { - /* double link list element */ - opal_list_item_t map_item; - /* pointer to header embedded in the shared memory file */ - mca_common_sm_file_header_t *map_seg; - /* base address of the mmap'ed file */ - unsigned char *map_addr; - /* base address of data segment */ - unsigned char *data_addr; - /* How big it is (in bytes) */ - size_t map_size; - /* Filename */ - char map_path[OPAL_PATH_MAX]; -#if defined(__WINDOWS__) - /* Handle to the object */ - HANDLE hMappedObject; -#endif /* defined(__WINDOWS__) */ -} mca_common_sm_mmap_t; - -OBJ_CLASS_DECLARATION(mca_common_sm_mmap_t); +typedef struct mca_common_sm_module_mmap_t +{ + mca_common_sm_module_t super; +} mca_common_sm_module_mmap_t; +OBJ_CLASS_DECLARATION(mca_common_sm_module_mmap_t); /** * This routine is used to set up a shared memory file, backed @@ -90,7 +63,7 @@ OBJ_CLASS_DECLARATION(mca_common_sm_mmap_t); * * @param size_ctl_structure size of the control structure at * the head of the file. The control structure - * is assumed to have mca_common_sm_file_header_t + * is assumed to have mca_common_sm_seg_header_t * as its first segment (IN) * * @param data_set_alignment alignment of the data segment. this @@ -101,14 +74,13 @@ OBJ_CLASS_DECLARATION(mca_common_sm_mmap_t); * * @return value pointer to control structure at head of file. 
*/ -OMPI_DECLSPEC extern -mca_common_sm_mmap_t* mca_common_sm_mmap_init( - ompi_proc_t **procs, - size_t num_procs, - size_t size, - char *file_name, - size_t size_ctl_structure, - size_t data_seg_alignment); +OMPI_DECLSPEC extern mca_common_sm_module_t * +mca_common_sm_mmap_init(ompi_proc_t **procs, + size_t num_procs, + size_t size, + char *file_name, + size_t size_ctl_structure, + size_t data_seg_alignment); /** * This routine is used to set up a shared memory file, backed @@ -122,22 +94,20 @@ mca_common_sm_mmap_t* mca_common_sm_mmap_init( * group must contain *only* local peers, or this function will return * NULL and not create any shared memory segment. */ -OMPI_DECLSPEC extern -mca_common_sm_mmap_t* mca_common_sm_mmap_init_group( - ompi_group_t *group, - size_t size, - char *file_name, - size_t size_ctl_structure, - size_t data_seg_alignment); +OMPI_DECLSPEC extern mca_common_sm_module_t * +mca_common_sm_mmap_init_group(ompi_group_t *group, + size_t size, + char *file_name, + size_t size_ctl_structure, + size_t data_seg_alignment); /* * Callback from the sm mpool */ -OMPI_DECLSPEC extern -void* mca_common_sm_mmap_seg_alloc( - struct mca_mpool_base_module_t* mpool, - size_t* size, - mca_mpool_base_registration_t** registration); +OMPI_DECLSPEC extern void * +mca_common_sm_mmap_seg_alloc(struct mca_mpool_base_module_t *mpool, + size_t *size, + mca_mpool_base_registration_t **registration); /** * This function will release all local resources attached to the @@ -149,8 +119,14 @@ void* mca_common_sm_mmap_seg_alloc( * @returnvalue 0 if everything was OK, otherwise a negative value. 
*/ -OMPI_DECLSPEC extern -int mca_common_sm_mmap_fini( mca_common_sm_mmap_t* sm_mmap ); +OMPI_DECLSPEC extern int +mca_common_sm_mmap_fini(mca_common_sm_module_t *mca_common_sm_module); + +/** + * component query routine + */ +OMPI_DECLSPEC extern int +mca_common_sm_mmap_component_query(void); END_C_DECLS diff --git a/ompi/mca/common/sm/common_sm_sysv.c b/ompi/mca/common/sm/common_sm_sysv.c new file mode 100644 index 0000000000..624743cfd6 --- /dev/null +++ b/ompi/mca/common/sm/common_sm_sysv.c @@ -0,0 +1,666 @@ +/* + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2008-2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2010 Los Alamos National Security, LLC. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include <errno.h> +#ifdef HAVE_UNISTD_H +#include <unistd.h> +#endif +#ifdef HAVE_STRING_H +#include <string.h> +#endif /* HAVE_STRING_H */ +#ifdef HAVE_FCNTL_H +#include <fcntl.h> +#endif /* HAVE_FCNTL_H */ +#ifdef HAVE_SYS_TYPES_H +#include <sys/types.h> +#endif +#ifdef HAVE_SYS_STAT_H +#include <sys/stat.h> +#endif /* HAVE_SYS_STAT_H */ +#if MCA_COMMON_SM_SYSV +#include <sys/ipc.h> +#include <sys/shm.h> +#endif /* MCA_COMMON_SM_SYSV */ + +#include "opal/util/output.h" +#include "opal/util/path.h" +#include "opal/align.h" +#include "opal/threads/mutex.h" +#include "opal/util/opal_sos.h" + +#include "orte/mca/rml/rml.h" +#include "orte/util/name_fns.h" +#include "orte/util/show_help.h" +#include "orte/runtime/orte_globals.h" +#include "orte/mca/errmgr/errmgr.h" + +#include "ompi/constants.h" +#include "ompi/proc/proc.h" +#include "ompi/mca/dpm/dpm.h" +#include "ompi/mca/mpool/sm/mpool_sm.h" +#include "common_sm_sysv.h" + +OBJ_CLASS_INSTANCE( + mca_common_sm_module_sysv_t, + opal_object_t, + NULL, + NULL +); + +/** + * lock to protect multiple instances of sysv_init() from + * being invoked simultaneously (because of RML usage). + */ +static opal_mutex_t mutex; + +/** + * list of RML messages that have arrived that have not yet been + * consumed by the thread who is looking to attach to the shared + * memory segment that the RML message corresponds to.
+ */ +static opal_list_t pending_rml_msgs; /* set once pending_rml_msgs has been constructed; mca_common_sm_sysv_init() does that lazily on its first call */ +static bool pending_rml_msgs_init = false; + +/** + * items on the pending_rml_msgs list: a received (file_name, shmid) pair parked until the caller initializing that segment looks for it + */ +typedef struct +{ + opal_list_item_t super; + char file_name[OPAL_PATH_MAX]; + int shmem_seg_inited; + int shmid; +} pending_sysv_rml_msg_t; + +OBJ_CLASS_INSTANCE( + pending_sysv_rml_msg_t, + opal_list_item_t, + NULL, + NULL +); + /* create_shmem_seg: attaches to segment shmid with shmat, lets the root (is_root != 0) mark it IPC_RMID, and returns a new mca_common_sm_module_sysv_t whose super records the header (module_seg), the segment base (module_seg_addr), the data address (module_data_addr: seg + size_ctl_structure, passed through OPAL_ALIGN_PTR when data_seg_alignment != 0) and size (module_size). returns NULL if shmat or shmctl fails or if the data address lies past seg + size. */ +static mca_common_sm_module_sysv_t * +create_shmem_seg(int shmid, + int is_root, + size_t size, + size_t size_ctl_structure, + size_t data_seg_alignment) +{ + unsigned char *addr = NULL; + mca_common_sm_module_sysv_t *map; + mca_common_sm_seg_header_t *seg; + + /* attach to the shared memory segment */ + if ((mca_common_sm_seg_header_t *)-1 == + (seg = (mca_common_sm_seg_header_t *)shmat(shmid, NULL, 0))) + { + int err = errno; + /** + * something really bad happened. + */ + orte_show_help("help-mpi-common-sm.txt", + "sys call fail", + 1, + orte_process_info.nodename, + "shmat(2)", + "", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + strerror(err), + err); + return NULL; + } + /** + * only the root will set IPC_RMID + */ + if (is_root) + { + /** + * mark the segment for destruction immediately after shmat. our hope + * is that the segment will only actually be destroyed after the last + * process detaches from it (i.e., when the shm_nattch member of the + * associated structure shmid_ds is zero). if we are here, we should + * be okay - our run-time test reported adequate system support. + */ + if (-1 == shmctl(shmid, IPC_RMID, NULL)) + { + int err = errno; + orte_show_help("help-mpi-common-sm.txt", + "sys call fail", + 1, + orte_process_info.nodename, + "shmctl(2)", + "", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + strerror(err), + err); + shmdt(seg); + return NULL; + } + } + + /** + * if we are here as the root, shmctl(shmid, IPC_RMID, NULL) was successful + * (non-root callers skip that call), so we don't have to worry about + * segment cleanup - the OS -should- take care of it - happy days...
+ */ + + /* set up the map object */ + map = OBJ_NEW(mca_common_sm_module_sysv_t); + /** + * the first entry in the segment is the control structure. The first + * entry in the control structure is an mca_common_sm_seg_header_t + * element + */ + map->super.module_seg = seg; + + addr = ((unsigned char *)seg) + size_ctl_structure; + /** + * if we have a data segment (i.e., if 0 != data_seg_alignment), + * then make it the first aligned address after the control + * structure. IF THIS HAPPENS, THIS IS A PROGRAMMING ERROR IN + * OPEN MPI! + */ + if (0 != data_seg_alignment) + { + addr = OPAL_ALIGN_PTR(addr, data_seg_alignment, unsigned char *); + + /* is addr past the end of the shared memory segment ? (presumably the case the PROGRAMMING ERROR remark above refers to) */ + if ((unsigned char *)seg + size < addr) + { + orte_show_help("help-mpi-common-sm.txt", + "mmap too small", + 1, + orte_process_info.nodename, + (unsigned long)size, + (unsigned long)size_ctl_structure, + (unsigned long)data_seg_alignment); + /* NOTE(review): this path returns without releasing map (OBJ_NEW above) or detaching seg (shmat above) - looks like a leak, confirm */ return NULL; + } + } + + map->super.module_data_addr = addr; + map->super.module_seg_addr = (unsigned char *)seg; + map->super.module_size = size; + + return map; +} + +/******************************************************************************/ +/** + * mca_common_sm_sysv_component_query - the run-time test: creates a page-sized IPC_PRIVATE segment, attaches to it, writes one byte, marks it IPC_RMID and then checks that IPC_STAT still succeeds on it. + * @returnvalue OMPI_SUCCESS when every step succeeds, OMPI_ERR_NOT_SUPPORTED otherwise; the attachment is always detached before returning. + */ +int +mca_common_sm_sysv_component_query(void) +{ + char c = 'j'; + int shmid = -1; + int rc = OMPI_ERR_NOT_SUPPORTED; + char *a = NULL; + char *addr = (char *)-1; + struct shmid_ds tmp_buff; + + if (-1 == (shmid = shmget(IPC_PRIVATE, + (size_t)(getpagesize()), + IPC_CREAT | IPC_EXCL | SHM_R | SHM_W))) + { + goto out; + } + else if ((char *)-1 == (addr = (char *)shmat(shmid, NULL, 0))) + { + /* NOTE(review): shmget succeeded but the segment is never marked IPC_RMID on this path, so it appears to be left behind - confirm */ goto out; + } + + /* protect against lazy establishment - may not be needed, but can't hurt */ + a = addr; + *a = c; + + if (-1 == shmctl(shmid, IPC_RMID, NULL)) + { + goto out; + } + else if (-1 == shmctl(shmid, IPC_STAT, &tmp_buff)) + { + goto out; + } + else /* all is well - rainbows and butterflies */ + { + rc =
OMPI_SUCCESS; + } + +out: + if ((char *)-1 != addr) + { + shmdt(addr); + } + return rc; +} + +/******************************************************************************/ +/** + * mca_common_sm_sysv_init - the lowest-named local proc (the root) creates a SysV segment and sends its shmid, tagged with file_name, to the other local procs over the RML; every other proc receives the shmid from the root and attaches to that segment. returns NULL when procs contains no local proc, otherwise &(map->super). + */ +mca_common_sm_module_t * +mca_common_sm_sysv_init(ompi_proc_t **procs, + size_t num_procs, + size_t size, + char *file_name, + size_t size_ctl_structure, + size_t data_seg_alignment) +{ + mca_common_sm_module_sysv_t *map = NULL; + bool found_lowest = false; + int shmid = -1; + int rc = 0; + size_t num_local_procs = 0; + size_t mem_offset; + size_t p; + struct iovec iov[2]; + char filename_to_send[OPAL_PATH_MAX]; + opal_list_item_t *item; + pending_sysv_rml_msg_t *rml_msg; + ompi_proc_t *temp_proc; + + /** + * reorder procs array to have all the local procs at the beginning. + * simultaneously look for the local proc with the lowest name. ensure + * that procs[0] is the lowest named process. + */ + for (p = 0; p < num_procs; ++p) + { + if (OPAL_PROC_ON_LOCAL_NODE(procs[p]->proc_flags)) + { + /* if we don't have a lowest, save the first one */ + if (!found_lowest) + { + procs[0] = procs[p]; + found_lowest = true; + } + else + { + /* save this proc */ + procs[num_local_procs] = procs[p]; + /** + * if we have a new lowest, swap it with position 0 + * so that procs[0] is always the lowest named proc + */ + if (orte_util_compare_name_fields(ORTE_NS_CMP_ALL, + &(procs[p]->proc_name), + &(procs[0]->proc_name)) < 0) + { + temp_proc = procs[0]; + procs[0] = procs[p]; + procs[num_local_procs] = temp_proc; + } + } + /** + * Regardless of the comparisons above, we found + * another proc on the local node, so increment + */ + ++num_local_procs; + } + } + + /* if there are no local procs, there's nothing to do */ + if (0 == num_local_procs) + { + return NULL; + } + /* NOTE(review): filename_to_send is not zero-initialized and strncpy copies at most sizeof - 1 bytes, so a terminating NUL is only guaranteed when file_name is shorter than that - confirm callers respect OPAL_PATH_MAX */ + strncpy(filename_to_send, file_name, sizeof(filename_to_send) - 1); + + iov[0].iov_base = &shmid; + iov[0].iov_len = sizeof(shmid); + iov[1].iov_base = filename_to_send; + iov[1].iov_len =
sizeof(filename_to_send); + + /** + * lock here to prevent multiple threads from invoking this function + * simultaneously. the critical section we're protecting is usage of + * the RML in this block. + */ + opal_mutex_lock(&mutex); + + if (!pending_rml_msgs_init) + { + OBJ_CONSTRUCT(&(pending_rml_msgs), opal_list_t); + pending_rml_msgs_init = true; + } + + /** + * figure out if i am the lowest proc in the group (aka "the root"). + * if i am, initialize the shared memory segment. + */ + if (0 == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, + ORTE_PROC_MY_NAME, + &(procs[0]->proc_name))) + { + /* create a new shared memory segment and save the shmid. */ + if (-1 == (shmid = shmget(IPC_PRIVATE, + size, + IPC_CREAT | IPC_EXCL | SHM_R | SHM_W))) + { + /** + * if we are here, a few things could have happened: + * o the system's shmmax limit is lower than the requested + * segment size. the user can either up shmmax or set + * mpool_sm_min_size to a value less than the system's current + * shmmax limit. + * o something else i don't know about ... + */ + int err = errno; + orte_show_help("help-mpi-common-sm.txt", + "shmget call fail", + 1, + orte_process_info.nodename, + "shmget(2)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + strerror(err), + err, + size); + } + else /* shmget was successful (the key is IPC_PRIVATE, no ftok involved) */ + { + map = create_shmem_seg(shmid, + 1, /* i am the root */ + size, + size_ctl_structure, + data_seg_alignment); + if (NULL != map) + { + /* initialize the segment */ + mem_offset = + map->super.module_data_addr - + (unsigned char *)map->super.module_seg; + map->super.module_seg->seg_offset = mem_offset; + map->super.module_seg->seg_size = size - mem_offset; + map->super.module_seg->seg_inited = 0; + opal_atomic_unlock(&map->super.module_seg->seg_lock); + } + else + { + /** + * best effort to delete the segment. + * may not be needed, but can't hurt.
+ */ + shmctl(shmid, IPC_RMID, NULL); + /** + * setting shmid to -1 here will tell + * the other procs that we failed. + */ + shmid = -1; + } + } + + /** + * signal the rest of the local procs: send them shmid (still -1 if the + * segment could not be created or attached) together with the file + * name. bump up the libevent polling frequency while we're using the RML. + */ + opal_progress_event_users_increment(); + for (p = 1; p < num_local_procs; ++p) + { + rc = orte_rml.send(&(procs[p]->proc_name), + iov, + 2, + OMPI_RML_TAG_SM_BACK_FILE_CREATED, + 0); + if (rc < (ssize_t)(iov[0].iov_len + + iov[1].iov_len)) + { + ORTE_ERROR_LOG(OMPI_ERR_COMM_FAILURE); + opal_progress_event_users_decrement(); + + /* free it all -- bad things are going to happen. NOTE(review): the segment is detached here but map is neither released nor reset, and the out: path still returns &(map->super) - confirm */ + if (NULL != map) + { + shmdt(map->super.module_seg_addr); + } + goto out; + } + } + opal_progress_event_users_decrement(); + } + else /* i am NOT the lowest local rank */ + { + /** + * all other procs will wait for the shared memory segment to be + * initialized before attaching to it. because the shared memory + * segment may be initialized simultaneously in multiple threads, + * the RML messages may arrive in any order. so, first check to + * see if we previously received a message for me. + */ + for (item = opal_list_get_first(&pending_rml_msgs); + opal_list_get_end(&pending_rml_msgs) != item; + item = opal_list_get_next(item)) + { + rml_msg = (pending_sysv_rml_msg_t *)item; + /* was the message for me? */ + if (0 == strcmp(rml_msg->file_name, file_name)) + { + opal_list_remove_item(&pending_rml_msgs, item); + /* set the shmid so i know what shared mem seg to attach to */ + shmid = rml_msg->shmid; + OBJ_RELEASE(item); + break; + } + } + + /** + * if we didn't find a message already waiting, block on + * receiving from the RML.
+ */ + if (opal_list_get_end(&pending_rml_msgs) == item) + { + while (1) + { + /** + * bump up the libevent polling frequency while we're + * in this RML recv, just to ensure we're checking + * libevent more frequently. + */ + opal_progress_event_users_increment(); + rc = orte_rml.recv(&(procs[0]->proc_name), + iov, + 2, + OMPI_RML_TAG_SM_BACK_FILE_CREATED, + 0); + opal_progress_event_users_decrement(); + if (rc < 0) + { + ORTE_ERROR_LOG(OMPI_ERR_RECV_LESS_THAN_POSTED); + goto out; + } + + /* was the message for me? if so, we're done (the recv above filled shmid and filename_to_send through iov) */ + if (0 == strcmp(filename_to_send, file_name)) + { + break; + } + + /* if not, put it on the pending list and try again */ + rml_msg = OBJ_NEW(pending_sysv_rml_msg_t); + if (NULL == rml_msg) + { + ORTE_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE); + goto out; + } + memcpy(rml_msg->file_name, + filename_to_send, + sizeof(rml_msg->file_name)); + rml_msg->shmid = shmid; + opal_list_append(&pending_rml_msgs, &(rml_msg->super)); + } /* end while 1 */ + } + + /* did the root setup the shmid correctly? if so, attach to it */ + if (-1 != shmid) + { + map = create_shmem_seg(shmid, + 0, /* i am NOT the root */ + size, + size_ctl_structure, + data_seg_alignment); + if (NULL == map) + { + goto out; + } + } + } /* end else - i am NOT the lowest local rank */ + +out: + opal_mutex_unlock(&mutex); + /* NOTE(review): map can still be NULL here (every failure path); &(map->super) then only evaluates to NULL if super is the first member of mca_common_sm_module_sysv_t, which is declared in common_sm_sysv.h - confirm */ + return &(map->super); +} + +/******************************************************************************/ +/** + * same as mca_common_sm_sysv_init(), but takes an (ompi_group_t *) + * argument instead of an array of ompi_proc_t's. + * + * this function just checks the group to ensure that all the procs + * are local, and if they are, calls mca_common_sm_sysv_init().
+ */ +mca_common_sm_module_t * +mca_common_sm_sysv_init_group(ompi_group_t *group, + size_t size, + char *file_name, + size_t size_ctl_structure, + size_t data_seg_alignment) +{ + size_t i; + size_t group_size; + ompi_proc_t *proc; + ompi_proc_t **procs; + mca_common_sm_module_t *ret; + + group_size = ompi_group_size(group); + procs = (ompi_proc_t **) malloc(sizeof(ompi_proc_t *) * group_size); + + if (NULL == procs) + { + return NULL; + } + + for (i = 0; i < group_size; ++i) + { + proc = ompi_group_peer_lookup(group,i); + if (!OPAL_PROC_ON_LOCAL_NODE(proc->proc_flags)) + { + free(procs); + return NULL; + } + procs[i] = proc; + } + + ret = mca_common_sm_sysv_init(procs, + group_size, + size, + file_name, + size_ctl_structure, + data_seg_alignment); + free(procs); + return ret; +} + +/******************************************************************************/ +/** + * sys v module finalization routine. + */ +int +mca_common_sm_sysv_fini(mca_common_sm_module_t *mca_common_sm_module) +{ + int rc = OMPI_SUCCESS; + mca_common_sm_module_sysv_t *sysv_module = + (mca_common_sm_module_sysv_t *)mca_common_sm_module; + + /** + * no need to shmctl to remove the segment, because we set + * IPC_RMID on the segment, meaning that when everyone detaches, + * the OS will automatically delete it. + */ + if (NULL != sysv_module->super.module_seg) + { + rc = shmdt(sysv_module->super.module_seg_addr); + sysv_module->super.module_seg_addr = NULL; + sysv_module->super.module_size = 0; + } + return rc; +} + +/******************************************************************************/ +/** + * allocate memory from a previously allocated shared memory block. 
+ * + * @param size size of request, in bytes (IN) + * + * @retval addr virtual address + */ +void * +mca_common_sm_sysv_seg_alloc(struct mca_mpool_base_module_t* mpool, + size_t* size, + mca_mpool_base_registration_t** registration) +{ + mca_mpool_sm_module_t *sm_module = (mca_mpool_sm_module_t*)mpool; + + mca_common_sm_module_sysv_t *map = + (mca_common_sm_module_sysv_t *)sm_module->sm_common_module; + + mca_common_sm_seg_header_t* seg = map->super.module_seg; + + void *addr; + + opal_atomic_lock(&seg->seg_lock); + + if(seg->seg_offset + *size > seg->seg_size) + { + addr = NULL; + } + else + { + size_t fixup; + + /* add base address to segment offset */ + addr = map->super.module_data_addr + seg->seg_offset; + seg->seg_offset += *size; + + /** + * fix up seg_offset so next allocation is aligned on a + * sizeof(long) boundry. do it here so that we don't have to + * check before checking remaining size in buffer + */ + if (0 < (fixup = (seg->seg_offset & (sizeof(long) - 1)))) + { + seg->seg_offset += sizeof(long) - fixup; + } + } + if (NULL != registration) + { + *registration = NULL; + } + opal_atomic_unlock(&seg->seg_lock); + return addr; +} + diff --git a/ompi/mca/common/sm/common_sm_sysv.h b/ompi/mca/common/sm/common_sm_sysv.h new file mode 100644 index 0000000000..339f19a194 --- /dev/null +++ b/ompi/mca/common/sm/common_sm_sysv.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. 
+ * Copyright (c) 2010 Los Alamos National Security, LLC. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef _COMMON_SM_SYSV_H_ +#define _COMMON_SM_SYSV_H_ + +#include "ompi_config.h" + +#include "opal/class/opal_object.h" +#include "opal/class/opal_list.h" +#include "opal/sys/atomic.h" +#include "ompi/mca/mpool/mpool.h" +#include "ompi/proc/proc.h" +#include "ompi/group/group.h" +#include "ompi/mca/common/sm/common_sm.h" + +BEGIN_C_DECLS + +struct mca_mpool_base_module_t; + +typedef struct mca_common_sm_module_sysv_t +{ + mca_common_sm_module_t super; +} mca_common_sm_module_sysv_t; + +OBJ_CLASS_DECLARATION(mca_common_sm_module_sysv_t); + +/** + * This routine is used to set up a System V shared memory segment. + * It is assumed that NO shared memory segment already exists with + * key = ftok(file_name, 0) when the "creator proccess" tries to + * shmget(key, size, ...). + * + * @param procs - array of (ompi_proc_t*)'s to create this shared + * memory segment for. This array must be writable; it may be edited + * (in undefined ways) if the array contains procs that are not on + * this host. It is assumed that the caller will simply free this + * array upon return. (INOUT) + * + * @param num_procs - length of the procs array (IN) + * + * @param size - size of the shared memory segment, in bytes (IN) + * + * @param file_name name of file to be opened that is + * used for shmget key generation. (IN) + * + * @param size_ctl_structure size of the control structure at + * the head of the file. The control structure + * is assumed to have mca_common_sm_seg_header_t + * as its first segment (IN) + * + * @param data_set_alignment alignment of the data segment. this + * follows the control structure. If this + * value if 0, then assume that there will + * be no data segment following the control + * structure. (IN) + * + * @return value pointer to control structure at head of file. 
+ */ +OMPI_DECLSPEC extern mca_common_sm_module_t * +mca_common_sm_sysv_init(ompi_proc_t **procs, + size_t num_procs, + size_t size, + char *file_name, + size_t size_ctl_structure, + size_t data_seg_alignment); + +/** + * This routine is used to set up a System V shared memory segment. + * It is assumed that NO shared memory segment already exists with + * key = ftok(file_name, 0) when the "creator (root) proccess" tries to + * shmget(key, size, ...). + * + * This routine is the same as mca_common_sm_sysv_init() except that + * it takes an (ompi_group_t*) parameter to specify the peers rather + * than an array of procs. Unlike mca_common_sm_sysv_init(), the + * group must contain *only* local peers, or this function will return + * NULL and not create any shared memory segment. + */ +OMPI_DECLSPEC extern mca_common_sm_module_t * +mca_common_sm_sysv_init_group(ompi_group_t *group, + size_t size, + char *file_name, + size_t size_ctl_structure, + size_t data_seg_alignment); + +/** + * Callback from the sm mpool + */ +OMPI_DECLSPEC extern void * +mca_common_sm_sysv_seg_alloc(struct mca_mpool_base_module_t *mpool, + size_t *size, + mca_mpool_base_registration_t **registration); + +/** + * This function will release all local resources attached to the + * shared memory segment. We assume that the operating system will destroy the + * shared memory segment when the last process detaches from it. + * + * It is assumed that the operating system's System V IPC implementation + * supports the following IPC_RMID semantics. + * + * Calling shmctl(shmid, IPC_RMID, ...) will actually destroy the shared memory + * segment *after* the last process detaches from it (i.e., when the shm_nattch + * member of the associated structure shmid_ds is zero). This behavior is + * important because we rely on it to release all allocated shared memory + * segments upon job termination - including abnormal job termination. 
+ * + * @param mca_common_sm_module - the control structure at head of the segment. + * + * @returnvalue 0 if everything was OK, otherwise a negative value. + */ + +OMPI_DECLSPEC extern int +mca_common_sm_sysv_fini(mca_common_sm_module_t *mca_common_sm_module); + +/** + * component query routine + */ + +OMPI_DECLSPEC extern int +mca_common_sm_sysv_component_query(void); + +END_C_DECLS + +#endif /* _COMMON_SM_SYSV_H_ */ + diff --git a/ompi/mca/common/sm/common_sm_windows.c b/ompi/mca/common/sm/common_sm_windows.c new file mode 100644 index 0000000000..ab69195a26 --- /dev/null +++ b/ompi/mca/common/sm/common_sm_windows.c @@ -0,0 +1,299 @@ +/* + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2008-2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2010 Los Alamos National Security, LLC. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include +#ifdef HAVE_UNISTD_H +#include +#endif +#ifdef HAVE_STRING_H +#include +#endif /* HAVE_STRING_H */ +#ifdef HAVE_FCNTL_H +#include +#endif /* HAVE_FCNTL_H */ +#ifdef HAVE_TIME_H +#include +#endif /* HAVE_TIME_H */ +#ifdef HAVE_SYS_TYPES_H +#include +#endif +#ifdef HAVE_SYS_STAT_H +#include +#endif /* HAVE_SYS_STAT_H */ +#ifdef HAVE_SYS_MMAN_H +#include +#endif + +#include "opal/util/output.h" +#include "opal/util/path.h" +#include "opal/align.h" +#include "opal/threads/mutex.h" +#include "opal/util/opal_sos.h" + +#include "orte/util/name_fns.h" +#include "orte/util/show_help.h" +#include "orte/runtime/orte_globals.h" +#include "orte/mca/errmgr/errmgr.h" + +#include "ompi/constants.h" +#include "ompi/proc/proc.h" +#include "ompi/mca/dpm/dpm.h" +#include "ompi/mca/mpool/sm/mpool_sm.h" +#include "common_sm_windows.h" + +OBJ_CLASS_INSTANCE( + mca_common_sm_module_windows_t, + opal_object_t, + NULL, + NULL +); + +/******************************************************************************/ +/** + * mca_common_sm_windows_component_query + */ +int +mca_common_sm_windows_component_query(void) +{ + return OMPI_SUCCESS; +} + +mca_common_sm_module_t * +mca_common_sm_windows_init(ompi_proc_t **procs, + size_t num_procs, + size_t size, char *file_name, + size_t size_ctl_structure, + size_t data_seg_alignment) +{ + int fd = -1, return_code = OMPI_SUCCESS; + bool file_previously_opened = false; + mca_common_sm_seg_header_t* seg = NULL; + mca_common_sm_module_windows_t* map = NULL; + unsigned char *addr = NULL; + size_t tmp, mem_offset; + + HANDLE hMapObject = INVALID_HANDLE_VALUE; + LPVOID lpvMem = NULL; + char *temp1, *temp2; + int rc; + + /** + * On Windows the shared file will be created by the OS directly on + * the system ressources. Therefore, no file get involved in the + * operation. 
However, a unique key should be used as name for the + * shared memory object in order to allow all processes to access + * the same unique shared memory region. The key will be obtained + * from the original file_name by replacing all path separator + * occurences by '/' (as '\' is not allowed on the object name). + */ + temp1 = strdup(file_name); + temp2 = temp1; + while( NULL != (temp2 = strchr(temp2, OPAL_PATH_SEP[0])) ) { + *temp2 = '/'; + } + hMapObject = CreateFileMapping( INVALID_HANDLE_VALUE, /* use paging file */ + NULL, /* no security attributes */ + PAGE_READWRITE, /* read/write access */ + 0, /* size: high 32-bits */ + (DWORD)size, /* size: low 32-bits */ + temp1); /* name of map object */ + if( NULL == hMapObject ) { + rc = GetLastError(); + goto return_error; + } + if( ERROR_ALREADY_EXISTS == GetLastError() ) + file_previously_opened=true; + free(temp1); /* relase the temporary file name */ + + /* Get a pointer to the file-mapped shared memory. */ + lpvMem = MapViewOfFile( hMapObject, /* object to map view of */ + FILE_MAP_WRITE, /* read/write access */ + 0, /* high offset: map from */ + 0, /* low offset: beginning */ + 0); /* default: map entire file */ + if( NULL == lpvMem ) { + rc = GetLastError(); + goto return_error; + } + seg = (mca_common_sm_seg_header_t*)lpvMem; + + /* set up the map object */ + map = OBJ_NEW(mca_common_sm_module_windows_t); + strncpy(map->super.module_seg_path, file_name, OPAL_PATH_MAX); + /* the first entry in the file is the control structure. The first + entry in the control structure is an mca_common_sm_seg_header_t + element */ + map->super.module_seg = seg; + + /* If we have a data segment (i.e., if 0 != data_seg_alignment), + then make it the first aligned address after the control + structure. 
*/ + if (0 != data_seg_alignment) { + addr = ((unsigned char *) seg) + size_ctl_structure; + /* calculate how far off alignment we are */ + tmp = ((size_t) addr) % data_seg_alignment; + /* if we're off alignment, then move up to the next alignment */ + if( tmp > 0 ) + addr += (data_seg_alignment - tmp); + + /* is addr past end of file ? */ + if( (unsigned char*)seg+size < addr ) { + opal_output(0, "mca_common_sm_init: memory region too small len %d addr %p\n", + size,addr); + goto return_error; + } + map->super.module_data_addr = addr; + } else { + map->super.module_data_addr = NULL; + } + mem_offset = addr-(unsigned char *)seg; + map->super.module_seg_addr = (unsigned char *)seg; + map->super.module_size = size; + + /* initialize the segment - only the first process to open the file */ + if( !file_previously_opened ) { + opal_atomic_unlock(&seg->seg_lock); + seg->seg_inited = false; + seg->seg_offset = mem_offset; + /* initialize size after subtracting out space used by the header */ + seg->seg_size = size - mem_offset; + } + + map->hMappedObject = hMapObject; + + return (mca_common_sm_module_t *)map; + + return_error: + { + char* localbuf = NULL; + FormatMessage( FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM, + NULL, rc, 0, (LPTSTR)&localbuf, 1024, NULL ); + opal_output( 0, "%s\n", localbuf ); + LocalFree( localbuf ); + } + if( NULL != lpvMem ) UnmapViewOfFile( lpvMem ); + if( NULL != hMapObject ) CloseHandle(hMapObject); + + return NULL; +} + +/* + * Same as mca_common_sm_windows_init(), but takes an (ompi_group_t*) + * argument instead of na array of ompi_proc_t's. + * + * This function just checks the group to ensure that all the procs + * are local, and if they are, calls mca_common_sm_windows_init(). 
+ */ +mca_common_sm_module_t * +mca_common_sm_windows_init_group(ompi_group_t *group, + size_t size, + char *file_name, + size_t size_ctl_structure, + size_t data_seg_alignment) +{ + size_t i, group_size; + ompi_proc_t *proc, **procs; + mca_common_sm_module_t *ret; + + group_size = ompi_group_size(group); + procs = (ompi_proc_t**) malloc(sizeof(ompi_proc_t*) * group_size); + if (NULL == procs) { + return NULL; + } + for (i = 0; i < group_size; ++i) { + proc = ompi_group_peer_lookup(group,i); + if (!OPAL_PROC_ON_LOCAL_NODE(proc->proc_flags)) { + free(procs); + return NULL; + } + procs[i] = proc; + } + + ret = mca_common_sm_windows_init(procs, group_size, size, file_name, + size_ctl_structure, data_seg_alignment); + free(procs); + return ret; +} + +int +mca_common_sm_windows_fini(mca_common_sm_module_t *mca_common_sm_module) +{ + mca_common_sm_module_windows_t *windows_module = + (mca_common_sm_module_windows_t *)mca_common_sm_module; + int rc = OMPI_SUCCESS; + + if( NULL != windows_module->super.module_seg ) { + BOOL return_error = UnmapViewOfFile( windows_module->super.module_seg_addr ); + if( false == return_error ) { + rc = GetLastError(); + } + CloseHandle(windows_module->super.hMappedObject); + + } + return rc; +} + +/** + * allocate memory from a previously allocated shared memory + * block. 
+ * + * @param size size of request, in bytes (IN) + * + * @retval addr virtual address + */ + +void * +mca_common_sm_windows_seg_alloc(struct mca_mpool_base_module_t* mpool, + size_t* size, + mca_mpool_base_registration_t** registration) +{ + mca_mpool_sm_module_t *sm_module = (mca_mpool_sm_module_t*) mpool; + mca_common_sm_module_windows_t *map = + (mca_common_sm_module_windows_t *)sm_module->sm_common_module; + mca_common_sm_seg_header_t* seg = map->super.module_seg; + void* addr; + + opal_atomic_lock(&seg->seg_lock); + if(seg->seg_offset + *size > seg->seg_size) { + addr = NULL; + } else { + size_t fixup; + + /* add base address to segment offset */ + addr = map->super.module_data_addr + seg->seg_offset; + seg->seg_offset += *size; + + /* fix up seg_offset so next allocation is aligned on a + sizeof(long) boundry. Do it here so that we don't have to + check before checking remaining size in buffer */ + if ((fixup = (seg->seg_offset & (sizeof(long) - 1))) > 0) { + seg->seg_offset += sizeof(long) - fixup; + } + } + if (NULL != registration) { + *registration = NULL; + } + opal_atomic_unlock(&seg->seg_lock); + return addr; +} + diff --git a/ompi/mca/common/sm/common_sm_windows.h b/ompi/mca/common/sm/common_sm_windows.h new file mode 100644 index 0000000000..6d024ee575 --- /dev/null +++ b/ompi/mca/common/sm/common_sm_windows.h @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. 
+ * Copyright (c) 2010 Los Alamos National Security, LLC. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef _COMMON_SM_WINDOWS_H_ +#define _COMMON_SM_WINDOWS_H_ + +#include "ompi_config.h" + +#include "opal/class/opal_object.h" +#include "opal/class/opal_list.h" +#include "opal/sys/atomic.h" +#include "ompi/mca/mpool/mpool.h" +#include "ompi/proc/proc.h" +#include "ompi/group/group.h" +#include "ompi/mca/common/sm/common_sm.h" + +BEGIN_C_DECLS + +struct mca_mpool_base_module_t; + +typedef struct mca_common_sm_module_windows_t +{ + mca_common_sm_module_t super; + HANDLE hMappedObject; +} mca_common_sm_module_windows_t; + +OBJ_CLASS_DECLARATION(mca_common_sm_module_windows_t); + +/** + * This routine is used to set up a shared memory file, backed + * by a specified file. It is assumed that the file does not + * exist before any of the current set of processes try and open + * it. + * + * @param procs - array of (ompi_proc_t*)'s to create this shared + * memory segment for. This array must be writable; it may be edited + * (in undefined ways) if the array contains procs that are not on + * this host. It is assumed that the caller will simply free this + * array upon return. (INOUT) + * + * @param num_procs - length of the procs array (IN) + * + * @param size - size of the file, in bytes (IN) + * + * @param file_name name of file to be opened. (IN) + * + * @param size_ctl_structure size of the control structure at + * the head of the file. The control structure + * is assumed to have mca_common_sm_seg_header_t + * as its first segment (IN) + * + * @param data_set_alignment alignment of the data segment. this + * follows the control structure. If this + * value if 0, then assume that there will + * be no data segment following the control + * structure. (IN) + * + * @return value pointer to control structure at head of file. 
+ */ +OMPI_DECLSPEC extern mca_common_sm_module_t * +mca_common_sm_windows_init(ompi_proc_t **procs, + size_t num_procs, + size_t size, + char *file_name, + size_t size_ctl_structure, + size_t data_seg_alignment); + +/** + * This routine is used to set up a shared memory file, backed + * by a specified file. It is assumed that the file does not + * exist before any of the current set of processes try and open + * it. + * + * This routine is the same as mca_common_sm_windows_init() except that + * it takes an (ompi_group_t*) parameter to specify the peers rather + * than an array of procs. Unlike mca_common_sm_windows_init(), the + * group must contain *only* local peers, or this function will return + * NULL and not create any shared memory segment. + */ +OMPI_DECLSPEC extern mca_common_sm_module_t * +mca_common_sm_windows_init_group(ompi_group_t *group, + size_t size, + char *file_name, + size_t size_ctl_structure, + size_t data_seg_alignment); + +/* + * Callback from the sm mpool + */ +OMPI_DECLSPEC extern void * +mca_common_sm_windows_seg_alloc(struct mca_mpool_base_module_t *mpool, + size_t *size, + mca_mpool_base_registration_t **registration); + +/** + * This function will release all local resources attached to the + * mmapped file. We assume that the operating system will destroy the + * file when the last process release it. + * + * @param sm_windows - the control structure at head of file. + * + * @returnvalue 0 if everything was OK, otherwise a negative value. 
+ */ + +OMPI_DECLSPEC extern int +mca_common_sm_windows_fini(mca_common_sm_module_t *mca_common_sm_module); + +/** + * component query routine + */ +OMPI_DECLSPEC extern int +mca_common_sm_windows_component_query(void); + +END_C_DECLS + +#endif + diff --git a/ompi/mca/common/sm/configure.m4 b/ompi/mca/common/sm/configure.m4 new file mode 100644 index 0000000000..05ba26e0fa --- /dev/null +++ b/ompi/mca/common/sm/configure.m4 @@ -0,0 +1,73 @@ +# -*- shell-script -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2010 Los Alamos National Security, LLC. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_common_sm_POST_CONFIG([should_build]) +# ------------------------------------------ +AC_DEFUN([MCA_common_sm_POST_CONFIG], [ + AM_CONDITIONAL([MCA_common_sm_windows], + [test $1 -eq 1 -a "x$MCA_common_sm_windows" = "x1"]) + AM_CONDITIONAL([MCA_common_sm_sysv], + [test $1 -eq 1 -a "x$MCA_common_sm_sysv" = "x1"]) +])dnl + +# MCA_common_sm_CONFIG([action-if-can-compile], +# [action-if-cant-compile]) +# ------------------------------------------------ +AC_DEFUN([MCA_common_sm_CONFIG], [ + OMPI_VAR_SCOPE_PUSH([MCA_common_sm_windows MCA_common_sm_sysv]) + + # Are we building on Windows? 
+ AC_CHECK_FUNC(CreateFileMapping, + [MCA_common_sm_windows=1], + [MCA_common_sm_windows=0]) + AC_DEFINE_UNQUOTED([MCA_COMMON_SM_WINDOWS], + [$MCA_common_sm_windows], + [Whether we have shared memory support for Windows or not]) + + # do we have sysv shared memory support on this system? + AC_CHECK_FUNC(shmget, + [ompi_check_sysv_happy="yes"], + [ompi_check_sysv_happy="no"]) + + # do we want to enable System V shared memory support? + AC_MSG_CHECKING([if want sysv support]) + AC_ARG_ENABLE(sysv, + AC_HELP_STRING([--enable-sysv], + [enable sysv shared memory support (default: disabled)])) + if test "$enable_sysv" = "yes"; then + if test "$ompi_check_sysv_happy" = "yes"; then + AC_MSG_RESULT([yes]) + MCA_common_sm_sysv=1 + else + MCA_common_sm_sysv=0 + AC_MSG_ERROR([sysv support requested but not found. aborting]) + fi + else + AC_MSG_RESULT([no]) + MCA_common_sm_sysv=0 + fi + AC_DEFINE_UNQUOTED([MCA_COMMON_SM_SYSV], + [$MCA_common_sm_sysv], + [Whether we have shared memory support for SYSV or not]) + +])dnl + diff --git a/ompi/mca/common/sm/help-mpi-common-sm.txt b/ompi/mca/common/sm/help-mpi-common-sm.txt index 9c2500f6b0..99b664881f 100644 --- a/ompi/mca/common/sm/help-mpi-common-sm.txt +++ b/ompi/mca/common/sm/help-mpi-common-sm.txt @@ -1,6 +1,8 @@ # -*- text -*- # # Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2010 Los Alamos National Security, LLC. +# All rights reserved. # # $COPYRIGHT$ # @@ -20,6 +22,19 @@ experience performance degradation. Process: %s Error: %s (errno %d) # +[shmget call fail] +A shmget call failed during shared memory initialization that should +not have. It is likely that your MPI job will now either abort or +experience performance degradation. + + Local host: %s + System call: %s + Process: %s + Error: %s (errno %d) + +Please verify that your system's shmax limit, or equivalent, is larger than +%d. 
On some Unix-like systems this can be done via: "sysctl -a | grep shm" +# [mmap too small] Open MPI requested a shared memory segment that was too small to do anything useful. This is likely an error in Open MPI itself. If you @@ -49,3 +64,11 @@ the MCA parameter "orte_no_session_dir". Local host: %s Fileame: %s +# +[sm support] +WARNING: "%s" not recognized - ignoring option. Suppressing additional +unrecognized option warnings. +# +[sysv rt test fail] +WARNING: It appears as if your system does not provide the run-time behavior +that we rely on to safely provide System V shared memory support. diff --git a/ompi/mca/mpool/sm/mpool_sm.h b/ompi/mca/mpool/sm/mpool_sm.h index 5e897a7b8e..50c985f403 100644 --- a/ompi/mca/mpool/sm/mpool_sm.h +++ b/ompi/mca/mpool/sm/mpool_sm.h @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2010 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -27,7 +29,7 @@ #include "opal/event/event.h" -#include "ompi/mca/common/sm/common_sm_mmap.h" +#include "ompi/mca/common/sm/common_sm.h" #include "ompi/mca/mpool/mpool.h" #include "ompi/mca/allocator/allocator.h" @@ -54,7 +56,7 @@ typedef struct mca_mpool_sm_module_t { long sm_size; mca_allocator_base_module_t * sm_allocator; struct mca_mpool_sm_mmap_t *sm_mmap; - mca_common_sm_mmap_t *sm_common_mmap; + mca_common_sm_module_t *sm_common_module; int32_t mem_node; } mca_mpool_sm_module_t; diff --git a/ompi/mca/mpool/sm/mpool_sm_component.c b/ompi/mca/mpool/sm/mpool_sm_component.c index 885c05b7df..bccaf78e38 100644 --- a/ompi/mca/mpool/sm/mpool_sm_component.c +++ b/ompi/mca/mpool/sm/mpool_sm_component.c @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved. 
+ * Copyright (c) 2010 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -33,7 +35,7 @@ #include "ompi/mca/allocator/base/base.h" #include "mpool_sm.h" -#include "ompi/mca/common/sm/common_sm_mmap.h" +#include "ompi/mca/common/sm/common_sm.h" #include "ompi/proc/proc.h" #if OPAL_ENABLE_FT_CR == 1 @@ -172,7 +174,7 @@ static mca_mpool_base_module_t* mca_mpool_sm_init( } /* add something for the control structure */ - mpool_module->sm_size += sizeof(mca_common_sm_mmap_t); + mpool_module->sm_size += sizeof(mca_common_sm_module_t); allocator_component = mca_allocator_component_lookup( mca_mpool_sm_component.sm_allocator_name); @@ -209,11 +211,11 @@ static mca_mpool_base_module_t* mca_mpool_sm_init( "mca_mpool_sm_init: shared memory size used: (%ld)", mpool_module->sm_size); - if (NULL == (mpool_module->sm_common_mmap = - mca_common_sm_mmap_init(procs, num_all_procs, - mpool_module->sm_size, - file_name, - sizeof(mca_common_sm_mmap_t), 8))) { + if (NULL == (mpool_module->sm_common_module = + mca_common_sm_init(procs, num_all_procs, + mpool_module->sm_size, + file_name, + sizeof(mca_common_sm_module_t), 8))) { opal_output(mca_mpool_sm_component.verbose, "mca_mpool_sm_init: unable to create shared memory mapping (%s)", file_name); free(file_name); @@ -227,7 +229,7 @@ static mca_mpool_base_module_t* mca_mpool_sm_init( /* setup allocator */ mpool_module->sm_allocator = allocator_component->allocator_init(true, - mca_common_sm_mmap_seg_alloc, + mca_common_sm_seg_alloc, NULL, &(mpool_module->super)); if(NULL == mpool_module->sm_allocator) { opal_output(0, "mca_mpool_sm_init: unable to initialize allocator"); diff --git a/ompi/mca/mpool/sm/mpool_sm_module.c b/ompi/mca/mpool/sm/mpool_sm_module.c index 901697405b..721c402423 100644 --- a/ompi/mca/mpool/sm/mpool_sm_module.c +++ b/ompi/mca/mpool/sm/mpool_sm_module.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. 
* All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2010 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -20,7 +22,7 @@ #include "ompi_config.h" #include #include "ompi/mca/mpool/sm/mpool_sm.h" -#include "ompi/mca/common/sm/common_sm_mmap.h" +#include "ompi/mca/common/sm/common_sm.h" #ifdef HAVE_UNISTD_H #include #endif @@ -57,7 +59,7 @@ void mca_mpool_sm_module_init(mca_mpool_sm_module_t* mpool) mpool->sm_size = 0; mpool->sm_allocator = NULL; mpool->sm_mmap = NULL; - mpool->sm_common_mmap = NULL; + mpool->sm_common_module = NULL; mpool->mem_node = -1; } @@ -67,8 +69,8 @@ void mca_mpool_sm_module_init(mca_mpool_sm_module_t* mpool) void* mca_mpool_sm_base(mca_mpool_base_module_t* mpool) { mca_mpool_sm_module_t *sm_mpool = (mca_mpool_sm_module_t*) mpool; - return (NULL != sm_mpool->sm_common_mmap) ? - sm_mpool->sm_common_mmap->map_addr : NULL; + return (NULL != sm_mpool->sm_common_module) ? + sm_mpool->sm_common_module->module_seg_addr : NULL; } /** @@ -132,23 +134,23 @@ static void sm_module_finalize(mca_mpool_base_module_t* module) { mca_mpool_sm_module_t *sm_module = (mca_mpool_sm_module_t*) module; - if (NULL != sm_module->sm_common_mmap) { + if (NULL != sm_module->sm_common_module) { if (OMPI_SUCCESS == - mca_common_sm_mmap_fini(sm_module->sm_common_mmap)) { + mca_common_sm_fini(sm_module->sm_common_module)) { #if OPAL_ENABLE_FT_CR == 1 /* Only unlink the file if we are *not* restarting. If we are restarting the file will be unlinked at a later time. 
*/ if (OPAL_CR_STATUS_RESTART_PRE != opal_cr_checkpointing_state && OPAL_CR_STATUS_RESTART_POST != opal_cr_checkpointing_state ) { - unlink(sm_module->sm_common_mmap->map_path); + unlink(sm_module->sm_common_module->module_seg_path); } #else - unlink(sm_module->sm_common_mmap->map_path); + unlink(sm_module->sm_common_module->module_seg_path); #endif } - OBJ_RELEASE(sm_module->sm_common_mmap); - sm_module->sm_common_mmap = NULL; + OBJ_RELEASE(sm_module->sm_common_module); + sm_module->sm_common_module = NULL; } } @@ -178,8 +180,8 @@ int mca_mpool_sm_ft_event(int state) { self_sm_module = (mca_mpool_sm_module_t*) self_module; /* Mark the old sm file for eventual removal via CRS */ - if (NULL != self_sm_module->sm_common_mmap) { - opal_crs_base_cleanup_append(self_sm_module->sm_common_mmap->map_path, false); + if (NULL != self_sm_module->sm_common_module) { + opal_crs_base_cleanup_append(self_sm_module->sm_common_module->module_seg_path, false); } /* Remove self from the list of all modules */ @@ -193,8 +195,8 @@ int mca_mpool_sm_ft_event(int state) { self_sm_module = (mca_mpool_sm_module_t*) self_module; /* Mark the old sm file for eventual removal via CRS */ - if (NULL != self_sm_module->sm_common_mmap) { - opal_crs_base_cleanup_append(self_sm_module->sm_common_mmap->map_path, false); + if (NULL != self_sm_module->sm_common_module) { + opal_crs_base_cleanup_append(self_sm_module->sm_common_module->module_seg_path, false); } /* Remove self from the list of all modules */