2012-08-16 23:11:35 +04:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
|
|
|
|
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
|
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
|
|
|
|
* @file
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "ompi_config.h"
|
|
|
|
|
|
|
|
#include "ompi/constants.h"
|
|
|
|
#include "ompi/communicator/communicator.h"
|
|
|
|
#include "ompi/mca/mpool/base/base.h"
|
|
|
|
#include "ompi/mca/bcol/bcol.h"
|
|
|
|
#include "ompi/mca/bcol/base/base.h"
|
|
|
|
#include "ompi/mca/coll/ml/coll_ml.h"
|
|
|
|
|
|
|
|
#include "orte/mca/rml/rml.h"
|
|
|
|
#include "orte/util/proc_info.h"
|
|
|
|
|
|
|
|
#include "bcol_basesmuma.h"
|
|
|
|
/*
|
|
|
|
* Public string showing the coll ompi_sm V2 component version number
|
|
|
|
*/
|
|
|
|
const char *mca_bcol_basesmuma_component_version_string =
|
|
|
|
"Open MPI bcol - basesmuma collective MCA component version " OMPI_VERSION;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Local functions
|
|
|
|
*/
|
|
|
|
|
|
|
|
static int basesmuma_open(void);
|
|
|
|
static int basesmuma_close(void);
|
|
|
|
static int mca_bcol_basesmuma_deregister_ctl_sm(
|
|
|
|
mca_bcol_basesmuma_component_t *bcol_component);
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Register an integer MCA parameter under the "bcol" / "basesmuma"
 * names and return the value obtained by looking it up again.
 *
 * @param param_name     name of the parameter to register
 * @param default_value  default passed to the registration call; also
 *                       the value returned if the lookup does not
 *                       overwrite the local copy
 *
 * @return the value written by mca_base_param_lookup_int(), or
 *         default_value if the lookup leaves it untouched
 */
static inline int mca_bcol_basesmuma_param_register_int(
        const char* param_name, int default_value)
{
    int id = mca_base_param_register_int("bcol", "basesmuma", param_name,
                                         NULL, default_value);
    int param_value = default_value;

    /* the lookup stores its result through the pointer argument */
    mca_base_param_lookup_int(id, &param_value);

    return param_value;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Instantiate the public struct with all of our public information
|
|
|
|
* and pointers to our public functions in it
|
|
|
|
*/
|
|
|
|
|
|
|
|
mca_bcol_basesmuma_component_t mca_bcol_basesmuma_component = {
|
|
|
|
|
|
|
|
/* First, fill in the super */
|
|
|
|
|
|
|
|
{
|
|
|
|
/* First, the mca_component_t struct containing meta
|
|
|
|
information about the component itself */
|
|
|
|
|
|
|
|
{
|
|
|
|
MCA_BCOL_BASE_VERSION_2_0_0,
|
|
|
|
|
|
|
|
/* Component name and version */
|
|
|
|
|
|
|
|
"basesmuma",
|
|
|
|
OMPI_MAJOR_VERSION,
|
|
|
|
OMPI_MINOR_VERSION,
|
|
|
|
OMPI_RELEASE_VERSION,
|
|
|
|
|
|
|
|
/* Component open and close functions */
|
|
|
|
|
|
|
|
basesmuma_open,
|
|
|
|
basesmuma_close,
|
|
|
|
},
|
|
|
|
|
|
|
|
/* Initialization / querying functions */
|
|
|
|
|
|
|
|
mca_bcol_basesmuma_init_query,
|
|
|
|
mca_bcol_basesmuma_comm_query,
|
|
|
|
NULL,
|
|
|
|
NULL,
|
|
|
|
false,
|
|
|
|
false,
|
|
|
|
0, /* (default) priority */
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Open the component
|
|
|
|
*/
|
|
|
|
static int basesmuma_open(void)
|
|
|
|
{
|
|
|
|
|
|
|
|
/* local variables */
|
|
|
|
mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
|
|
|
|
int ret = OMPI_SUCCESS;
|
|
|
|
opal_mutex_t *mutex_ptr;
|
|
|
|
int dummy;
|
|
|
|
|
|
|
|
/* set component priority */
|
|
|
|
cs->super.priority=
|
|
|
|
mca_bcol_basesmuma_param_register_int("priority",90);
|
|
|
|
|
|
|
|
/* set control region size (bytes), per proc */
|
|
|
|
cs->basesmuma_ctl_size_per_proc=
|
|
|
|
|
|
|
|
mca_bcol_basesmuma_param_register_int("basesmuma_ctl_size_per_proc",
|
|
|
|
CACHE_LINE_SIZE);
|
|
|
|
|
|
|
|
/* set control region alignment (bytes) */
|
|
|
|
cs->basesmuma_ctl_alignment=
|
|
|
|
mca_bcol_basesmuma_param_register_int("basesmuma_ctl_alignment",
|
|
|
|
getpagesize());
|
|
|
|
|
|
|
|
/* Number of memory banks */
|
|
|
|
cs->basesmuma_num_mem_banks=
|
|
|
|
mca_bcol_basesmuma_param_register_int("basesmuma_num_ctl_banks",
|
|
|
|
2);
|
|
|
|
|
|
|
|
/* Number of regions per memory bank */
|
|
|
|
cs->basesmuma_num_regions_per_bank=
|
|
|
|
mca_bcol_basesmuma_param_register_int("basesmuma_num_buffs_per_bank",
|
|
|
|
16);
|
|
|
|
|
|
|
|
/* number of polling loops to allow pending resources to
|
|
|
|
* complete their work
|
|
|
|
*/
|
|
|
|
cs->n_poll_loops=
|
|
|
|
mca_bcol_basesmuma_param_register_int("n_poll_loops",4);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Make sure that the number of banks is a power of 2
|
|
|
|
*/
|
|
|
|
cs->basesmuma_num_mem_banks=
|
|
|
|
roundup_to_power_radix(2,cs->basesmuma_num_mem_banks, &dummy);
|
|
|
|
if ( 0 == cs->basesmuma_num_mem_banks ) {
|
|
|
|
ret=OMPI_ERROR;
|
|
|
|
goto ERROR;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Make sure that the the number of buffers is a power of 2
|
|
|
|
*/
|
|
|
|
cs->basesmuma_num_regions_per_bank=
|
|
|
|
roundup_to_power_radix(2,cs->basesmuma_num_regions_per_bank, &dummy);
|
|
|
|
if ( 0 == cs->basesmuma_num_regions_per_bank ) {
|
|
|
|
ret=OMPI_ERROR;
|
|
|
|
goto ERROR;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Number of groups supported */
|
|
|
|
cs->n_groups_supported=
|
|
|
|
mca_bcol_basesmuma_param_register_int("n_groups_supported",100);
|
|
|
|
|
|
|
|
/* order of fanin tree */
|
|
|
|
cs->radix_fanin=
|
|
|
|
mca_bcol_basesmuma_param_register_int("radix_fanin",2);
|
|
|
|
|
|
|
|
/* order of fanout tree */
|
|
|
|
cs->radix_fanout=
|
|
|
|
mca_bcol_basesmuma_param_register_int("radix_fanout",2);
|
|
|
|
|
|
|
|
/* order of read tree */
|
|
|
|
cs->radix_read_tree =
|
|
|
|
mca_bcol_basesmuma_param_register_int("radix_read_tree",3);
|
|
|
|
|
|
|
|
/* order of reduction fanout tree */
|
|
|
|
cs->order_reduction_tree=
|
|
|
|
mca_bcol_basesmuma_param_register_int("order_reduction_tree",2);
|
|
|
|
|
|
|
|
/* k-nomial radix */
|
|
|
|
cs->k_nomial_radix=
|
|
|
|
mca_bcol_basesmuma_param_register_int("k_nomial_radix",3);
|
|
|
|
|
|
|
|
/* number of polling loops for non-blocking algorithms */
|
|
|
|
cs->num_to_probe =
|
|
|
|
mca_bcol_basesmuma_param_register_int("num_to_probe",10);
|
|
|
|
|
|
|
|
/* radix of the k-ary scatter tree */
|
|
|
|
cs->scatter_kary_radix =
|
|
|
|
mca_bcol_basesmuma_param_register_int("scatter_kary_radix",4);
|
|
|
|
|
|
|
|
/* Portals initialization */
|
|
|
|
cs->portals_init = false;
|
|
|
|
cs->portals_info = NULL;
|
|
|
|
|
|
|
|
cs->verbose =
|
|
|
|
mca_bcol_basesmuma_param_register_int("verbose",0);
|
|
|
|
|
|
|
|
/* register parmeters controlling message fragementation */
|
|
|
|
cs->super.min_frag_size=
|
|
|
|
mca_bcol_basesmuma_param_register_int("min_frag_size",getpagesize());
|
|
|
|
cs->super.max_frag_size=
|
|
|
|
mca_bcol_basesmuma_param_register_int("max_frag_size",FRAG_SIZE_NO_LIMIT);
|
|
|
|
/* by default use pre-registered shared memory segments */
|
|
|
|
/* RLG NOTE: When we have a systematic way to handle single memory
|
|
|
|
* copy semantics, we need to update this logic
|
|
|
|
*/
|
|
|
|
cs->super.can_use_user_buffers=
|
|
|
|
mca_bcol_basesmuma_param_register_int("can_use_user_buffers",0);
|
|
|
|
cs->super.use_pipeline=
|
|
|
|
mca_bcol_basesmuma_param_register_int("use_pipeline",1);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* initialization
|
|
|
|
*/
|
|
|
|
cs->sm_ctl_structs=NULL;
|
|
|
|
OBJ_CONSTRUCT(&(cs->sm_connections_list),opal_list_t);
|
|
|
|
OBJ_CONSTRUCT(&(cs->nb_admin_barriers),opal_list_t);
|
|
|
|
mutex_ptr= &(cs->nb_admin_barriers_mutex);
|
|
|
|
OBJ_CONSTRUCT(mutex_ptr, opal_mutex_t);
|
|
|
|
|
|
|
|
/* Control structures object construct
|
|
|
|
*/
|
|
|
|
OBJ_CONSTRUCT(&(cs->ctl_structures), opal_list_t);
|
|
|
|
|
|
|
|
/* shared memory has not been registered yet */
|
|
|
|
cs->mpool_inited = false;
|
|
|
|
|
|
|
|
/* initialize base file names */
|
|
|
|
cs->clt_base_fname="sm_ctl_mem_";
|
|
|
|
cs->payload_base_fname="sm_payload_mem_";
|
|
|
|
|
|
|
|
/* initialize the size of the shared memory scartch region */
|
|
|
|
cs->my_scratch_shared_memory_size=getpagesize();
|
|
|
|
cs->my_scratch_shared_memory=NULL;
|
|
|
|
cs->scratch_offset_from_base_ctl_file=0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* register the progess function
|
|
|
|
*/
|
|
|
|
ret=opal_progress_register(bcol_basesmuma_progress);
|
|
|
|
if (MPI_SUCCESS != ret) {
|
|
|
|
opal_output(0, "failed to register the progress function\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
ERROR:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* release the control structure backing file
|
|
|
|
*/
|
|
|
|
static int mca_bcol_basesmuma_deregister_ctl_sm(
|
|
|
|
mca_bcol_basesmuma_component_t *bcol_component)
|
|
|
|
{
|
|
|
|
|
|
|
|
/* local variables */
|
|
|
|
int ret;
|
|
|
|
bcol_basesmuma_smcm_mmap_t *sm_ctl_structs;
|
|
|
|
|
|
|
|
/* get a handle on the backing file */
|
|
|
|
sm_ctl_structs=bcol_component->sm_ctl_structs;
|
|
|
|
/* Nothing to free */
|
|
|
|
if (!sm_ctl_structs){
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* unmap the shared memory file */
|
|
|
|
ret=munmap((void *) sm_ctl_structs->map_addr, sm_ctl_structs->map_size);
|
|
|
|
if( 0 > ret) {
|
|
|
|
fprintf(stderr,"Failed to munmap the shared memory file %s \n",
|
|
|
|
sm_ctl_structs->map_path);
|
|
|
|
fflush(stderr);
|
|
|
|
return OMPI_ERROR;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* set the pointer to NULL */
|
|
|
|
/*sm_ctl_structs->map_addr = NULL;*/
|
|
|
|
|
|
|
|
/* remove the file */
|
|
|
|
/*ret = remove(sm_ctl_structs->map_path);*/
|
|
|
|
if( 0 > ret) {
|
|
|
|
fprintf(stderr,"Failed to remove the shared memory file %s \n",
|
|
|
|
sm_ctl_structs->map_path);
|
|
|
|
perror("Failed to remove the shared memory file");
|
|
|
|
fflush(stderr);
|
|
|
|
return OMPI_ERROR;
|
|
|
|
}
|
|
|
|
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Close the component
|
|
|
|
*/
|
|
|
|
static int basesmuma_close(void)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
bcol_basesmuma_registration_data_t *net_ctx;
|
|
|
|
bcol_base_network_context_t *net_reg;
|
|
|
|
mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
|
|
|
|
|
|
|
|
/* gvm Leak FIX */
|
|
|
|
while(!opal_list_is_empty(&(cs->ctl_structures))) {
|
|
|
|
opal_list_item_t *item;
|
|
|
|
item = opal_list_remove_first(&(cs->ctl_structures));
|
|
|
|
OBJ_DESTRUCT(item);
|
|
|
|
}
|
|
|
|
OBJ_DESTRUCT(&(cs->ctl_structures));
|
|
|
|
|
|
|
|
|
|
|
|
/* deregister the progress function */
|
|
|
|
ret=opal_progress_unregister(bcol_basesmuma_progress);
|
|
|
|
if (MPI_SUCCESS != ret) {
|
|
|
|
opal_output(0, "failed to unregister the progress function\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
/* remove the control structure backing file */
|
|
|
|
ret=mca_bcol_basesmuma_deregister_ctl_sm(&mca_bcol_basesmuma_component);
|
|
|
|
if (MPI_SUCCESS != ret) {
|
|
|
|
opal_output(0, "failed to remove control structure backing file\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
/* remove the network contexts - only one network context defined for
|
|
|
|
* this component.
|
|
|
|
*/
|
|
|
|
/* file_name returne by asprintf, so need to free the resource */
|
|
|
|
if(mca_bcol_basesmuma_component.super.network_contexts ) {
|
|
|
|
net_reg=(bcol_base_network_context_t *)
|
|
|
|
mca_bcol_basesmuma_component.super.network_contexts[0];
|
|
|
|
if(net_reg) {
|
|
|
|
net_ctx=(bcol_basesmuma_registration_data_t *)net_reg->context_data;
|
|
|
|
if( net_ctx) {
|
|
|
|
if(net_ctx->file_name) {
|
|
|
|
free(net_ctx->file_name);
|
|
|
|
}
|
|
|
|
free(net_ctx);
|
|
|
|
}
|
|
|
|
free(net_reg);
|
|
|
|
}
|
|
|
|
free(mca_bcol_basesmuma_component.super.network_contexts);
|
|
|
|
mca_bcol_basesmuma_component.super.network_contexts=NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* normal return */
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* query to see if the component is available for use, and can
|
|
|
|
* satisfy the thread and progress requirements
|
|
|
|
*/
|
|
|
|
int mca_bcol_basesmuma_init_query(bool enable_progress_threads,
|
|
|
|
bool enable_mpi_threads)
|
|
|
|
{
|
|
|
|
/* done */
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* This routine is used to allocate shared memory for the the shared
|
|
|
|
* memory control regions.
|
|
|
|
*/
|
|
|
|
int mca_bcol_basesmuma_allocate_sm_ctl_memory(mca_bcol_basesmuma_component_t *cs)
|
|
|
|
{
|
|
|
|
/* local variables */
|
|
|
|
int name_length, ret;
|
|
|
|
size_t ctl_length;
|
|
|
|
char *name, *ctl_mem;
|
|
|
|
|
|
|
|
/* set the file name */
|
|
|
|
name_length=asprintf(&name,
|
|
|
|
"%s"OPAL_PATH_SEP"%s""%0d",
|
|
|
|
orte_process_info.job_session_dir,
|
|
|
|
cs->clt_base_fname,
|
|
|
|
(int)getpid());
|
|
|
|
if( 0 > name_length ) {
|
|
|
|
return OMPI_ERROR;
|
|
|
|
}
|
|
|
|
/* make sure name is not too long */
|
|
|
|
if ( OPAL_PATH_MAX < (name_length-1) ) {
|
|
|
|
return OMPI_ERROR;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* compute segment length */
|
|
|
|
|
|
|
|
ctl_length=(cs->basesmuma_num_mem_banks*
|
|
|
|
cs->basesmuma_num_regions_per_bank+cs->basesmuma_num_mem_banks)
|
|
|
|
*sizeof(mca_bcol_basesmuma_ctl_struct_t)*cs->n_groups_supported;
|
|
|
|
/* need two banks of memory per group - for algorithms that have
|
|
|
|
* user payload, and those that don't
|
|
|
|
*/
|
|
|
|
ctl_length*=2;
|
|
|
|
|
|
|
|
/* add space for internal library management purposes */
|
|
|
|
ctl_length+=cs->my_scratch_shared_memory_size;
|
|
|
|
|
|
|
|
/* round up to multiple of page size */
|
|
|
|
ctl_length=(ctl_length-1)/getpagesize()+1;
|
|
|
|
ctl_length*=getpagesize();
|
|
|
|
|
|
|
|
/* allocate memory that will be mmaped */
|
|
|
|
ctl_mem=(char *)valloc(ctl_length);
|
|
|
|
if( !ctl_mem) {
|
|
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* allocate the shared file */
|
|
|
|
cs->sm_ctl_structs=bcol_basesmuma_smcm_mem_reg(ctl_mem,
|
|
|
|
ctl_length,getpagesize(),name);
|
|
|
|
if( !cs->sm_ctl_structs) {
|
|
|
|
fprintf(stderr," In mca_bcol_basesmuma_allocate_sm_ctl_memory failed to allocathe backing file %s \n",name);
|
|
|
|
ret=ORTE_ERR_OUT_OF_RESOURCE;
|
|
|
|
goto Error;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* free the memory allocated by asprintf for the file name -
|
|
|
|
* in mca_base_smcm_mem_reg this name is copied into a new
|
|
|
|
* memory location */
|
|
|
|
free(name);
|
|
|
|
|
|
|
|
/* successful return */
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
|
|
|
|
Error:
|
|
|
|
if(name) {
|
|
|
|
free(name);
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|