/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2007      Sun Microsystems, Inc.  All rights reserved.
 * Copyright (c) 2008-2010 Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2010-2012 Los Alamos National Security, LLC.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

/* ASSUMING local process homogeneity with respect to all utilized shared memory
 * facilities. that is, if one local process deems a particular shared memory
 * facility acceptable, then ALL local processes should be able to utilize that
 * facility. as it stands, this is an important point because one process
 * dictates to all other local processes which common sm component will be
 * selected based on its own, local run-time test.
 */

/* RML Messaging in common sm and Our Assumptions
 * o MPI_Init is single threaded
 * o this routine will not be called after MPI_Init.
 *
 * if these assumptions ever change, then we may need to add some support code
 * that queues up RML messages that have arrived, but have not yet been
 * consumed by the thread who is looking to complete its component
 * initialization.
 */

#include "ompi_config.h"

#include "opal/align.h"
#include "opal/util/argv.h"
#if OPAL_ENABLE_FT_CR == 1
#include "opal/runtime/opal_cr.h"
#endif

#include "orte/util/proc_info.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"

#include "ompi/constants.h"
#include "ompi/mca/dpm/dpm.h"
#include "ompi/mca/mpool/sm/mpool_sm.h"

#include "common_sm_rml.h"

OBJ_CLASS_INSTANCE(
    mca_common_sm_module_t,
    opal_list_item_t,
    NULL,
    NULL
);

/* shared memory information used for initialization and setup. */
static opal_shmem_ds_t shmem_ds;

/* ////////////////////////////////////////////////////////////////////////// */
/* static utility functions */
/* ////////////////////////////////////////////////////////////////////////// */

/* ////////////////////////////////////////////////////////////////////////// */
/**
 * attach to the segment described by the file-static shmem_ds and build the
 * module object that describes the attachment.
 *
 * @param size_ctl_structure  size, in bytes, of the control structure that
 *                            sits at the start of the segment (IN)
 * @param data_seg_alignment  required alignment of the data area that follows
 *                            the control structure; 0 means "no data
 *                            segment", in which case no alignment is
 *                            applied (IN)
 * @param first_call          true if the caller is the first process to
 *                            attach; the attach counter is then reset to 0
 *                            before it is incremented (IN)
 *
 * @retval a new module on success, NULL on failure. on every failure path
 *         that follows a successful attach, the segment is detached again
 *         (and the module object, if already created, is released) so that
 *         nothing is leaked.
 */
static mca_common_sm_module_t *
attach_and_init(size_t size_ctl_structure,
                size_t data_seg_alignment,
                bool first_call)
{
    mca_common_sm_module_t *map = NULL;
    mca_common_sm_seg_header_t *seg = NULL;
    unsigned char *addr = NULL;

    /* map the file and initialize segment state */
    seg = (mca_common_sm_seg_header_t *)opal_shmem_segment_attach(&shmem_ds);
    if (NULL == seg) {
        return NULL;
    }
    opal_atomic_rmb();

    /* set up the map object */
    if (NULL == (map = OBJ_NEW(mca_common_sm_module_t))) {
        ORTE_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
        /* we are attached, but have nothing to hand back - undo the attach */
        (void)opal_shmem_segment_detach(&shmem_ds);
        return NULL;
    }

    /* copy information: from ====> to */
    opal_shmem_ds_copy(&shmem_ds, &map->shmem_ds);

    /* the first entry in the file is the control structure. the first
     * entry in the control structure is an mca_common_sm_seg_header_t
     * element
     */
    map->module_seg = seg;

    addr = ((unsigned char *)seg) + size_ctl_structure;

    /* if we have a data segment (i.e., if 0 != data_seg_alignment),
     * then make it the first aligned address after the control
     * structure.
     */
    if (0 != data_seg_alignment) {
        addr = OPAL_ALIGN_PTR(addr, data_seg_alignment, unsigned char *);
        /* is addr past end of the shared memory segment?
         * IF THIS HAPPENS, THIS IS A PROGRAMMING ERROR IN OPEN MPI!
         */
        if ((unsigned char *)seg + shmem_ds.seg_size < addr) {
            orte_show_help("help-mpi-common-sm.txt",
                           "mmap too small",
                           1,
                           orte_process_info.nodename,
                           (unsigned long)shmem_ds.seg_size,
                           (unsigned long)size_ctl_structure,
                           (unsigned long)data_seg_alignment);
            /* release everything acquired above before bailing out */
            OBJ_RELEASE(map);
            (void)opal_shmem_segment_detach(&shmem_ds);
            return NULL;
        }
    }

    map->module_data_addr = addr;
    map->module_seg_addr = (unsigned char *)seg;

    /* map object successfully initialized - we can safely increment
     * seg_num_procs_inited. the process that created the segment waits on
     * this counter before it calls opal_shmem_unlink.
     */
    if (first_call) {
        /* make sure that the first call to this function initializes
         * seg_num_procs_inited to zero */
        map->module_seg->seg_num_procs_inited = 0;
        opal_atomic_wmb();
    }

    (void)opal_atomic_add_size_t(&map->module_seg->seg_num_procs_inited, 1);
    opal_atomic_wmb();

    return map;
}

/* ////////////////////////////////////////////////////////////////////////// */
/**
 * create (lowest named local process) or attach to (every other local
 * process) a shared memory segment shared by the local processes in procs.
 *
 * NOTE: the procs array is modified in place - local processes are moved to
 * the front and procs[0] becomes the lowest named local process.
 *
 * @param procs               array of processes to consider (IN/OUT)
 * @param num_procs           number of entries in procs (IN)
 * @param size                requested size of the segment, in bytes (IN)
 * @param file_name           name passed to opal_shmem_segment_create and to
 *                            the RML broadcast (IN)
 * @param size_ctl_structure  size of the control structure at the start of
 *                            the segment (IN)
 * @param data_seg_alignment  alignment of the data area; 0 for none (IN)
 *
 * @retval module describing the attached segment, or NULL if there are fewer
 *         than 2 local processes or if setup failed.
 */
mca_common_sm_module_t *
mca_common_sm_init(ompi_proc_t **procs,
                   size_t num_procs,
                   size_t size,
                   char *file_name,
                   size_t size_ctl_structure,
                   size_t data_seg_alignment)
{
    /* indicates whether or not i'm the lowest named process */
    bool lowest_local_proc = false;
    mca_common_sm_module_t *map = NULL;
    ompi_proc_t *temp_proc = NULL;
    bool found_lowest = false;
    size_t num_local_procs = 0, p = 0;

    /* o reorder procs array to have all the local procs at the beginning.
     * o look for the local proc with the lowest name.
     * o determine the number of local procs.
     * o ensure that procs[0] is the lowest named process.
     */
    for (p = 0; p < num_procs; ++p) {
        if (OPAL_PROC_ON_LOCAL_NODE(procs[p]->proc_flags)) {
            /* if we don't have a lowest, save the first one */
            if (!found_lowest) {
                procs[0] = procs[p];
                found_lowest = true;
            }
            else {
                /* save this proc */
                procs[num_local_procs] = procs[p];
                /* if we have a new lowest, swap it with position 0
                 * so that procs[0] is always the lowest named proc
                 */
                if (OPAL_VALUE2_GREATER ==
                    orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
                                                  &(procs[p]->proc_name),
                                                  &(procs[0]->proc_name))) {
                    temp_proc = procs[0];
                    procs[0] = procs[p];
                    procs[num_local_procs] = temp_proc;
                }
            }
            /* regardless of the comparisons above, we found
             * another proc on the local node, so increment
             */
            ++num_local_procs;
        }
    }

    /* if there are fewer than 2 local processes, there's nothing to do. */
    if (num_local_procs < 2) {
        return NULL;
    }

    /* determine whether or not i am the lowest local process */
    lowest_local_proc =
        (0 == orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
                                            ORTE_PROC_MY_NAME,
                                            &(procs[0]->proc_name)));

    /* figure out if i am the lowest rank in the group.
     * if so, i will create the shared memory backing store
     */
    if (lowest_local_proc) {
        if (OPAL_SUCCESS == opal_shmem_segment_create(&shmem_ds, file_name,
                                                      size)) {
            map = attach_and_init(size_ctl_structure, data_seg_alignment,
                                  true);
        }

        if (NULL != map) {
            size_t mem_offset = map->module_data_addr -
                                (unsigned char *)map->module_seg;
            map->module_seg->seg_offset = mem_offset;
            map->module_seg->seg_size = size - mem_offset;
            opal_atomic_init(&map->module_seg->seg_lock,
                             OPAL_ATOMIC_UNLOCKED);
            map->module_seg->seg_inited = 0;
        }
        else {
            /* fail! (either the create or the attach did not work)
             * only invalidate the shmem_ds. doing so will let the rest
             * of the local processes know that the lowest local rank
             * failed to properly initialize the shared memory segment, so
             * they should try to carry on without shared memory support.
             *
             * shmem_ds is file-static, so this must also be done when the
             * create itself fails - otherwise a descriptor left over from
             * an earlier call could be broadcast as if it were valid.
             */
            OPAL_SHMEM_DS_INVALIDATE(&shmem_ds);
        }
    }

    /* send shmem info to the rest of the local procs. */
    if (OMPI_SUCCESS !=
        mca_common_sm_rml_info_bcast(&shmem_ds, procs, num_local_procs,
                                     OMPI_RML_TAG_SM_BACK_FILE_CREATED,
                                     lowest_local_proc, file_name)) {
        goto out;
    }

    /* are we dealing with a valid shmem_ds?  that is, did the lowest
     * process successfully initialize the shared memory segment? */
    if (OPAL_SHMEM_DS_IS_VALID(&shmem_ds)) {
        if (!lowest_local_proc) {
            map = attach_and_init(size_ctl_structure, data_seg_alignment,
                                  false);
        }
        else {
            /* wait until every other participating process has attached to
             * the shared memory segment. a valid shmem_ds on the lowest
             * local process implies that map was set up above.
             */
            while (num_local_procs > map->module_seg->seg_num_procs_inited) {
                opal_atomic_rmb();
            }
            opal_shmem_unlink(&shmem_ds);
        }
    }

out:
    return map;
}

/* ////////////////////////////////////////////////////////////////////////// */
/**
 * this routine is the same as mca_common_sm_init() except that
 * it takes an (ompi_group_t *) parameter to specify the peers rather
 * than an array of procs.  unlike mca_common_sm_init(), the
 * group must contain *only* local peers, or this function will return
 * NULL and not create any shared memory segment.
 *
 * @param group               group whose members are the peers (IN)
 * @param size                requested size of the segment, in bytes (IN)
 * @param file_name           forwarded to mca_common_sm_init (IN)
 * @param size_ctl_structure  forwarded to mca_common_sm_init (IN)
 * @param data_seg_alignment  forwarded to mca_common_sm_init (IN)
 *
 * @retval result of mca_common_sm_init, or NULL if the group has fewer than
 *         2 members, contains a non-local peer, or memory could not be
 *         allocated.
 */
mca_common_sm_module_t *
mca_common_sm_init_group(ompi_group_t *group,
                         size_t size,
                         char *file_name,
                         size_t size_ctl_structure,
                         size_t data_seg_alignment)
{
    mca_common_sm_module_t *ret = NULL;
    ompi_proc_t **procs = NULL;
    size_t i;
    size_t group_size;
    ompi_proc_t *proc;

    /* if there are fewer than 2 procs, there's nothing to do */
    if ((group_size = ompi_group_size(group)) < 2) {
        goto out;
    }
    /* temporary array handed to mca_common_sm_init; freed before returning */
    if (NULL == (procs = malloc(group_size * sizeof(*procs)))) {
        ORTE_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
        goto out;
    }

    /* make sure that all the procs in the group are local */
    for (i = 0; i < group_size; ++i) {
        proc = ompi_group_peer_lookup(group, i);
        if (!OPAL_PROC_ON_LOCAL_NODE(proc->proc_flags)) {
            goto out;
        }
        procs[i] = proc;
    }

    /* let mca_common_sm_init take care of the rest ... */
    ret = mca_common_sm_init(procs, group_size, size, file_name,
                             size_ctl_structure, data_seg_alignment);

out:
    /* free(NULL) is a no-op, so no guard is needed */
    free(procs);
    return ret;
}

/* ////////////////////////////////////////////////////////////////////////// */
/**
 * allocate memory from a previously allocated shared memory
 * block.
 *
 * @param mpool         sm mpool module whose common sm segment is used (IN)
 * @param size          size of request, in bytes (IN)
 * @param registration  if non-NULL, *registration is set to NULL (OUT)
 *
 * @retval addr virtual address, or NULL if the request does not fit in the
 *         space that remains in the segment
 */
void *
mca_common_sm_seg_alloc(struct mca_mpool_base_module_t *mpool,
                        size_t *size,
                        mca_mpool_base_registration_t **registration)
{
    mca_mpool_sm_module_t *sm_module = (mca_mpool_sm_module_t *)mpool;
    mca_common_sm_seg_header_t *seg = sm_module->sm_common_module->module_seg;
    void *addr;

    opal_atomic_lock(&seg->seg_lock);
    /* compare against the space that is left rather than computing
     * seg_offset + *size, which could wrap around for a very large request
     * and slip past the check. seg_offset may sit slightly past seg_size
     * after the alignment fix-up below, hence the first test.
     */
    if (seg->seg_offset > seg->seg_size ||
        *size > seg->seg_size - seg->seg_offset) {
        addr = NULL;
    }
    else {
        size_t fixup;

        /* add base address to segment offset */
        addr = sm_module->sm_common_module->module_data_addr + seg->seg_offset;
        seg->seg_offset += *size;

        /* fix up seg_offset so next allocation is aligned on a
         * sizeof(long) boundary.  Do it here so that we don't have to
         * check before checking remaining size in buffer
         */
        if ((fixup = (seg->seg_offset & (sizeof(long) - 1))) > 0) {
            seg->seg_offset += sizeof(long) - fixup;
        }
    }
    if (NULL != registration) {
        *registration = NULL;
    }
    opal_atomic_unlock(&seg->seg_lock);
    return addr;
}

/* ////////////////////////////////////////////////////////////////////////// */
/**
 * detach from the shared memory segment described by a module.
 *
 * @param mca_common_sm_module  module returned by one of the init routines
 *                              (IN)
 *
 * @retval OMPI_SUCCESS if the module has no segment or the detach succeeded,
 *         OMPI_ERROR if opal_shmem_segment_detach failed
 */
int
mca_common_sm_fini(mca_common_sm_module_t *mca_common_sm_module)
{
    int rc = OMPI_SUCCESS;

    if (NULL != mca_common_sm_module->module_seg) {
        if (OPAL_SUCCESS !=
            opal_shmem_segment_detach(&mca_common_sm_module->shmem_ds)) {
            rc = OMPI_ERROR;
        }
    }
    return rc;
}