diff --git a/ompi/mca/coll/sm2/coll_sm2.h b/ompi/mca/coll/sm2/coll_sm2.h index 63f1ad019e..da4be78302 100644 --- a/ompi/mca/coll/sm2/coll_sm2.h +++ b/ompi/mca/coll/sm2/coll_sm2.h @@ -67,7 +67,38 @@ BEGIN_C_DECLS mca_coll_base_component_1_1_0_t super; /** MCA parameter: Priority of this component */ - int sm_priority; + int sm2_priority; + + /** MCA parameter: control region size (bytes), per proc */ + size_t sm2_ctl_size_per_proc; + + /** control region size (bytes) actually allocated; computed in comm_query, not a registered MCA parameter */ + size_t sm2_ctl_size_allocated; + + /** MCA parameter: control region alignment */ + size_t sm2_ctl_alignment; + + /** MCA parameter: Max data Segment size */ + size_t sm2_max_data_seg_size; + + /** MCA parameter: Min data Segment size */ + size_t sm2_data_seg_size; + + /** data region size (bytes) actually allocated - per proc (not a registered MCA parameter) */ + size_t sm2_data_size_allocated; + + /** MCA parameter: data region alignment */ + size_t sm2_data_alignment; + + /** MCA parameter: number of memory banks */ + size_t sm2_num_mem_banks; + + /** MCA parameter: number of regions per memory bank */ + size_t sm2_num_regions_per_bank; + + /* size of shared memory backing file */ + size_t size_sm2_backing_file; + }; /** diff --git a/ompi/mca/coll/sm2/coll_sm2_component.c b/ompi/mca/coll/sm2/coll_sm2_component.c index cf28323e4c..7f16068c84 100644 --- a/ompi/mca/coll/sm2/coll_sm2_component.c +++ b/ompi/mca/coll/sm2/coll_sm2_component.c @@ -23,12 +23,14 @@ */ #include "ompi_config.h" +#include <unistd.h> #include "ompi/constants.h" #include "ompi/communicator/communicator.h" #include "ompi/mca/coll/coll.h" #include "opal/util/show_help.h" #include "coll_sm2.h" +#include "ompi/mca/coll/base/base.h" /* @@ -51,6 +53,31 @@ static int sm2_module_enable(struct mca_coll_base_module_1_1_0_t *module, static int sm2_open(void); static int sm2_close(void); +static bool have_local_peers(ompi_group_t *group, size_t size) /* true only if all of the first size procs in group carry OMPI_PROC_FLAG_LOCAL */ +{ + size_t i; + ompi_proc_t *proc; + + for (i = 0; i < size; ++i) { + 
proc = ompi_group_peer_lookup(group,i); + if (0 == (proc->proc_flags & OMPI_PROC_FLAG_LOCAL)) { + return false; + } + } + return true; +} + +static inline int mca_coll_sm2_param_register_int( + const char* param_name, int default_value) +{ + int id = mca_base_param_register_int("coll","sm2",param_name,NULL,default_value); + int param_value = default_value; + mca_base_param_lookup_int(id,&param_value); + return param_value; +} + + + /* * Instantiate the public struct with all of our public information @@ -130,14 +157,47 @@ mca_coll_sm2_module_destruct(mca_coll_sm2_module_t *module) */ static int sm2_open(void) { - mca_base_component_t *c = &mca_coll_sm2_component.super.collm_version; + /* local variables */ + int size; mca_coll_sm2_component_t *cs = &mca_coll_sm2_component; - mca_base_param_reg_int(c, "priority", - "Priority of the sm-v2 coll component", - false, false, - cs->sm_priority, - &cs->sm_priority); + /* set component priority */ + cs->sm2_priority= + mca_coll_sm2_param_register_int("sm_priority",0); + + /* set control region size (bytes), per proc */ + cs->sm2_ctl_size_per_proc= + mca_coll_sm2_param_register_int("sm2_ctl_size_per_proc",sizeof(int)); + + /* initialize control region size allocated */ + cs->sm2_ctl_size_allocated=0; + + /* set control region alignment (bytes) */ + cs->sm2_ctl_alignment= + mca_coll_sm2_param_register_int("sm2_ctl_alignment",getpagesize()); + + /* Min data Segment size (bytes) - per proc */ + cs->sm2_data_seg_size= + mca_coll_sm2_param_register_int("sm2_data_seg_size",0); + + /* Max data Segment size (bytes) - per proc */ + cs->sm2_max_data_seg_size= + mca_coll_sm2_param_register_int("sm2_max_data_seg_size",8*getpagesize()); + + /* initialize data region size allocated */ + cs->sm2_data_size_allocated=0; + + /* Data region alignment (bytes) - per proc */ + cs->sm2_data_alignment= + mca_coll_sm2_param_register_int("sm2_data_alignment",CACHE_LINE_SIZE); + + /* Number of memory banks */ + cs->sm2_num_mem_banks= + 
mca_coll_sm2_param_register_int("sm2_num_mem_banks",2); + + /* Number of regions per memory bank */ + cs->sm2_num_regions_per_bank= + mca_coll_sm2_param_register_int("sm2_num_regions_per_bank",8); return OMPI_SUCCESS; } @@ -157,6 +217,7 @@ static int sm2_close(void) int mca_coll_sm2_init_query(bool enable_progress_threads, bool enable_mpi_threads) { + /* at this stage there is no reason to disqualify this component */ /* done */ return OMPI_SUCCESS; @@ -164,16 +225,49 @@ int mca_coll_sm2_init_query(bool enable_progress_threads, /* query to see if the module is available for use on the given - * communicator, and if so, what it's priority is. + * communicator, and if so, what its priority is. This is where + * the backing shared-memory file is to be created (only its size is computed so far). */ struct mca_coll_base_module_1_1_0_t * mca_coll_sm2_comm_query(struct ompi_communicator_t *comm, int *priority) { /* local variables */ mca_coll_sm2_module_t *sm_module; + size_t coll_sm2_comm_backing_file_size=0; + size_t coll_sm2_per_proc_segment_size=0; + int group_size; + size_t alignment,size,size_tot,size_tot_per_proc_per_seg; + size_t tot_size_per_bank,size_tot_per_segment; + size_t tot_size_mem_banks; + size_t ctl_memory_per_proc_per_segment; + size_t mem_management_per_proc_per_block; + size_t mem_management_per_proc; + size_t mem_management_total; + size_t size_sm2_backing_file; + + /* + * This is activated only for intra-communicators + */ + if (OMPI_COMM_IS_INTER(comm) ) { + return NULL; + } + + /* + * Use only if more than one proc in the communicator + */ + if (1 == ompi_comm_size(comm) ) { + return NULL; + } + + /* check to see if all procs are on the same node, and therefore + * can communicate using shared memory + */ + if ( !have_local_peers(comm->c_local_group, ompi_comm_size(comm))) { + return NULL; + } /* Get our priority */ - *priority = mca_coll_sm2_component.sm_priority; + *priority = mca_coll_sm2_component.sm2_priority; /* allocate and initialize an sm-v2 module */ sm_module = 
OBJ_NEW(mca_coll_sm2_module_t); @@ -196,6 +290,87 @@ mca_coll_sm2_comm_query(struct ompi_communicator_t *comm, int *priority) sm_module->super.coll_scan = NULL; sm_module->super.coll_scatter = NULL; sm_module->super.coll_scatterv = NULL; + + /* + * create backing file + */ + + /* + * set group size + */ + group_size=ompi_comm_size(comm); + + /* + * get control region size + */ + /* just enough place for one flag per process */ + ctl_memory_per_proc_per_segment=sizeof(int); + if( mca_coll_sm2_component.sm2_ctl_size_per_proc > ctl_memory_per_proc_per_segment ) + ctl_memory_per_proc_per_segment=mca_coll_sm2_component.sm2_ctl_size_per_proc; + ctl_memory_per_proc_per_segment=ctl_memory_per_proc_per_segment * group_size ; + + /* pad this up to the alignment needed by the data segment, as the + * data segment will directly follow the control segment in + * memory. + */ + alignment=mca_coll_sm2_component.sm2_data_alignment; /* NOTE(review): a zero alignment parameter would divide by zero below */ + ctl_memory_per_proc_per_segment= + (alignment + ctl_memory_per_proc_per_segment -1) / alignment; + ctl_memory_per_proc_per_segment*=alignment; + mca_coll_sm2_component.sm2_ctl_size_allocated=ctl_memory_per_proc_per_segment; + + /* get data region size - allocation happens on a page granularity, with + * a minimum of a page allocated per proc, so adjust to this + */ + size=mca_coll_sm2_component.sm2_data_seg_size; + if( size > mca_coll_sm2_component.sm2_max_data_seg_size ) + size=mca_coll_sm2_component.sm2_max_data_seg_size; + size_tot_per_proc_per_seg=size+ mca_coll_sm2_component.sm2_ctl_size_allocated; + if( size_tot_per_proc_per_seg < getpagesize()) + size_tot_per_proc_per_seg=getpagesize(); + /* round this up to the nearest integer page-size multiple */ + size_tot_per_proc_per_seg= ( size_tot_per_proc_per_seg + getpagesize() - 1)/ + getpagesize(); + size_tot_per_proc_per_seg*=getpagesize(); + + /* compute segment memory needed */ + size_tot_per_segment=group_size * size_tot_per_proc_per_seg ; + + /* compute memory per bank */ + 
tot_size_per_bank=size_tot_per_segment*mca_coll_sm2_component.sm2_num_regions_per_bank; + + /* compute total memory in the memory banks */ + tot_size_mem_banks=tot_size_per_bank*mca_coll_sm2_component.sm2_num_mem_banks; + + /* compute the amount of memory needed for the asynchronous barriers used to + * manage the memory resources. + */ + /* for each bank, 2 sets of barrier buffers */ + mem_management_per_proc_per_block= 2 * CACHE_LINE_SIZE ; + /* add in number of banks */ + mem_management_per_proc= mem_management_per_proc_per_block * + mca_coll_sm2_component.sm2_num_mem_banks; + /* round up to page multiples */ + mem_management_per_proc=(mem_management_per_proc + + getpagesize() -1 ) / getpagesize(); + mem_management_per_proc*=getpagesize(); + + /* total memory management required */ + mem_management_total=mem_management_per_proc * group_size; + + /* total size of backing file */ + size_sm2_backing_file=mem_management_total+tot_size_mem_banks; /* NOTE(review): local only; never stored in the component's size_sm2_backing_file field */ + + /* allocate backing file */ + + /* initialize local counters */ + + /* set pointers */ + + /* touch pages to apply memory affinity - Note: do we really need this or will + * the algorithms do this */ + + /* return */ return &(sm_module->super); } @@ -207,6 +382,15 @@ static int sm2_module_enable(struct mca_coll_base_module_1_1_0_t *module, struct ompi_communicator_t *comm) { + /* local variables */ + char output_buffer[2*MPI_MAX_OBJECT_NAME]; + + memset(&output_buffer[0],0,sizeof(output_buffer)); + snprintf(output_buffer,sizeof(output_buffer),"%s (cid %d)", comm->c_name, + comm->c_contextid); + opal_output_verbose(10, mca_coll_base_output, + "coll:sm2:enable: new communicator: %s", output_buffer); + /* All done */ return OMPI_SUCCESS; }