/* openmpi/ompi/mca/bcol/basesmuma/bcol_basesmuma_module.c */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
 * @file
 *
 * Shared-memory (basesmuma) bcol module: module construction/destruction,
 * collective function registration, and the communicator query that creates
 * the backing shared-memory file.
 */
#include "ompi_config.h"
#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/bcol/base/base.h"
#include "ompi/patterns/net/netpatterns.h"
#include "opal/util/show_help.h"
#include "opal/align.h"
#include "bcol_basesmuma.h"
#include "bcol_basesmuma_utils.h"
static void
mca_bcol_basesmuma_module_construct(mca_bcol_basesmuma_module_t *module)
{
module->super.bcol_component = (mca_bcol_base_component_t *) &mca_bcol_basesmuma_component;
module->super.list_n_connected = NULL;
module->super.hier_scather_offset = 0;
}
static void
mca_bcol_basesmuma_module_destruct(mca_bcol_basesmuma_module_t *sm_module)
{
/* local variables */
int i;
mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
    /*
     * release allocated resources
     */
/* ...but not until you're sure you have no outstanding collectives */
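    /* non-blocking administrative barriers may still be pending on the
     * component's nb_admin_barriers list; drive opal_progress() until that
     * list drains */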
while(0 != opal_list_get_size(&(cs->nb_admin_barriers))) {
opal_progress();
}
/* collective topology data */
if( sm_module->fanout_read_tree) {
for(i=0 ; i < sm_module->super.size_of_subgroup ; i++ ) {
if(0 < sm_module->fanout_read_tree[i].n_children ) {
free(sm_module->fanout_read_tree[i].children_ranks);
sm_module->fanout_read_tree[i].children_ranks=NULL;
}
}
free(sm_module->fanout_read_tree);
sm_module->fanout_read_tree=NULL;
}
    /* gvm leak fix: reduction_tree[].children_ranks also has to be freed;
     * the size of the reduction_tree array is the size of the subgroup.
     */
if( sm_module->reduction_tree) {
for(i=0 ; i < sm_module->super.size_of_subgroup ; i++ ) {
if(0 < sm_module->reduction_tree[i].n_children ) {
free(sm_module->reduction_tree[i].children_ranks);
sm_module->reduction_tree[i].children_ranks=NULL;
}
}
free(sm_module->reduction_tree);
sm_module->reduction_tree=NULL;
}
/* gvm Leak FIX */
if (sm_module->fanout_node.children_ranks){
free(sm_module->fanout_node.children_ranks);
sm_module->fanout_node.children_ranks = NULL;
}
if (sm_module->fanin_node.children_ranks){
free(sm_module->fanin_node.children_ranks);
sm_module->fanin_node.children_ranks = NULL;
}
    /* colls_no_user_data resources */
if(sm_module->colls_no_user_data.ctl_buffs_mgmt){
free(sm_module->colls_no_user_data.ctl_buffs_mgmt);
sm_module->colls_no_user_data.ctl_buffs_mgmt=NULL;
}
if(sm_module->colls_no_user_data.ctl_buffs){
free(sm_module->colls_no_user_data.ctl_buffs);
sm_module->colls_no_user_data.ctl_buffs=NULL;
}
    /* colls_with_user_data resources */
if(sm_module->colls_with_user_data.ctl_buffs_mgmt){
free(sm_module->colls_with_user_data.ctl_buffs_mgmt);
sm_module->colls_with_user_data.ctl_buffs_mgmt=NULL;
}
if(sm_module->colls_with_user_data.ctl_buffs){
free(sm_module->colls_with_user_data.ctl_buffs);
sm_module->colls_with_user_data.ctl_buffs=NULL;
}
if(sm_module->shared_memory_scratch_space) {
free(sm_module->shared_memory_scratch_space);
sm_module->shared_memory_scratch_space=NULL;
}
#if 1
if(sm_module->scatter_kary_tree) {
for(i=0 ; i < sm_module->super.size_of_subgroup ; i++ ) {
if(0 < sm_module->scatter_kary_tree[i].n_children) {
free(sm_module->scatter_kary_tree[i].children_ranks);
sm_module->scatter_kary_tree[i].children_ranks=NULL;
}
}
free(sm_module->scatter_kary_tree);
}
#endif
if(NULL != sm_module->super.list_n_connected ){
free(sm_module->super.list_n_connected);
sm_module->super.list_n_connected = NULL;
}
    /* TODO: the k-nomial allgather tree is not freed here */
    /* done */
}
static void bcol_basesmuma_set_small_msg_thresholds(struct mca_bcol_base_module_t *super)
{
mca_bcol_basesmuma_module_t *basesmuma_module =
(mca_bcol_basesmuma_module_t *) super;
size_t basesmuma_offset = bcol_basesmuma_data_offset_calc(basesmuma_module);
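    /* The usable payload for a single small-message collective is the ML
     * buffer size minus this per-buffer data offset.  Purely illustrative
     * numbers: a 4096-byte ML buffer with a 128-byte offset leaves 3968
     * bytes for bcast/allreduce/reduce/scatter, and 3968 / comm_size bytes
     * per rank for gather/allgather. */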
    /* Allreduce threshold: ML buffer size minus the data offset */
    super->small_message_thresholds[BCOL_ALLREDUCE] =
        basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset;
    /* Bcast threshold: ML buffer size minus the data offset */
    super->small_message_thresholds[BCOL_BCAST] =
        basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset;
    /* Gather threshold: (ML buffer size - data offset) divided by the communicator size */
    super->small_message_thresholds[BCOL_GATHER] =
        (basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset) /
        ompi_comm_size(basesmuma_module->super.sbgp_partner_module->group_comm);
    /* Allgather threshold: (ML buffer size - data offset) divided by the communicator size */
    super->small_message_thresholds[BCOL_ALLGATHER] =
        (basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset) /
        ompi_comm_size(basesmuma_module->super.sbgp_partner_module->group_comm);
    /* Reduce threshold: ML buffer size minus the data offset */
    super->small_message_thresholds[BCOL_REDUCE] =
        basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset;
    /* Scatter threshold: ML buffer size minus the data offset */
    super->small_message_thresholds[BCOL_SCATTER] =
        basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset;
}
static void load_func_with_choices(mca_bcol_base_module_t *super)
{
int fnc;
/* Loading memory management and collective functions */
for (fnc=0; fnc < BCOL_NUM_OF_FUNCTIONS; fnc++) {
super->bcol_function_init_table[fnc] = NULL;
}
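    /* entries that remain NULL are collectives for which this bcol does not
     * register an init function */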
super->bcol_function_init_table[BCOL_FANIN] = bcol_basesmuma_fanin_init;
super->bcol_function_init_table[BCOL_FANOUT] = bcol_basesmuma_fanout_init;
super->bcol_function_init_table[BCOL_BARRIER] = bcol_basesmuma_barrier_init;
super->bcol_function_init_table[BCOL_BCAST] = bcol_basesmuma_bcast_init;
super->bcol_function_init_table[BCOL_ALLREDUCE] = NULL;
super->bcol_function_init_table[BCOL_REDUCE] = NULL;
super->bcol_function_init_table[BCOL_GATHER] = NULL;
super->bcol_function_init_table[BCOL_ALLGATHER] = NULL;
super->bcol_function_init_table[BCOL_SYNC] = bcol_basesmuma_memsync_init;
/* memory management */
super->bcol_memory_init = bcol_basesmuma_bank_init_opti;
super->k_nomial_tree = bcol_basesmuma_setup_knomial_tree;
/* Set thresholds */
super->set_small_msg_thresholds = bcol_basesmuma_set_small_msg_thresholds;
}
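/* Compute this process's node in the recursive k-nomial exchange pattern for
 * the subgroup; the radix comes from the component's k_nomial_radix
 * parameter. */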
static int load_recursive_knomial_info(mca_bcol_basesmuma_module_t *sm_module)
{
int rc = OMPI_SUCCESS;
rc = netpatterns_setup_recursive_knomial_tree_node(
sm_module->super.sbgp_partner_module->group_size,
sm_module->super.sbgp_partner_module->my_index,
mca_bcol_basesmuma_component.k_nomial_radix,
&sm_module->knomial_exchange_tree);
return rc;
}
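/* Build the recursive k-nomial allgather tree node for this process, using
 * the per-rank connection counts recorded in list_n_connected. */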
int bcol_basesmuma_setup_knomial_tree(mca_bcol_base_module_t *super)
{
mca_bcol_basesmuma_module_t *sm_module = (mca_bcol_basesmuma_module_t *) super;
return netpatterns_setup_recursive_knomial_allgather_tree_node(
sm_module->super.sbgp_partner_module->group_size,
sm_module->super.sbgp_partner_module->my_index,
mca_bcol_basesmuma_component.k_nomial_radix,
super->list_n_connected,
&sm_module->knomial_allgather_tree);
}
/* Query whether the module is available for use on the given
 * communicator, and if so, what its priority is.  This is where
 * the backing shared-memory file is created.
 */
mca_bcol_base_module_t **
mca_bcol_basesmuma_comm_query(mca_sbgp_base_module_t *module, int *num_modules)
{
/* local variables */
mca_bcol_base_module_t **sm_modules = NULL;
mca_bcol_basesmuma_module_t *sm_module;
bcol_basesmuma_registration_data_t *sm_reg_data;
int ret, my_rank, name_length;
char *name;
int i;
int bcast_radix;
mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
/*mca_base_component_list_item_t *hdl_cli = NULL;*/
/*int hdl_num;*/
    /* at this point I think there is only a single shared-memory
       bcol that we need to be concerned with */
/* No group, no modules */
if (OPAL_UNLIKELY(NULL == module)) {
return NULL;
}
/* allocate and initialize an sm_bcol module */
sm_module = OBJ_NEW(mca_bcol_basesmuma_module_t);
/* set the subgroup */
sm_module->super.sbgp_partner_module=module;
(*num_modules)=1;
cs->super.n_net_contexts = *num_modules;
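    /* a single basesmuma module is reported per subgroup, and with it a
     * single shared-memory network context */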
sm_modules = (mca_bcol_base_module_t **) malloc((cs->super.n_net_contexts)*
sizeof(mca_bcol_base_module_t *));
if( !sm_modules ) {
        fprintf(stderr,"In mca_bcol_basesmuma_comm_query: failed to allocate memory for sm_modules\n");
fflush(stderr);
return NULL;
}
sm_module->reduction_tree = NULL;
sm_module->fanout_read_tree = NULL;
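    /* recursive-doubling exchange pattern for this process - used by the
     * recursive-doubling barrier (see the commented-out test near the end of
     * this routine) */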
ret=netpatterns_setup_recursive_doubling_tree_node(
module->group_size,module->my_index,
&(sm_module->recursive_doubling_tree));
if(OMPI_SUCCESS != ret) {
fprintf(stderr,"Error setting up recursive_doubling_tree \n");
fflush(stderr);
return NULL;
}
/* setup the fanin tree - this is used only as part of a hierarchical
* barrier, so will set this up with rank 0 as the root */
my_rank=module->my_index;
ret=netpatterns_setup_narray_tree(cs->radix_fanin,
my_rank,module->group_size,&(sm_module->fanin_node));
if(OMPI_SUCCESS != ret) {
fprintf(stderr,"Error setting up fanin tree \n");
fflush(stderr);
return NULL;
}
/* setup the fanout tree - this is used only as part of a hierarchical
* barrier, so will set this up with rank 0 as the root */
ret=netpatterns_setup_narray_tree(cs->radix_fanout,
my_rank,module->group_size,&(sm_module->fanout_node));
if(OMPI_SUCCESS != ret) {
fprintf(stderr,"Error setting up fanout tree \n");
fflush(stderr);
return NULL;
}
    /*
     * Setup the broadcast tree - this is used only as part of a hierarchical
     * bcast, so it is set up with rank 0 as the root.  The full tree (one
     * node descriptor per rank in the subgroup) is stored.
     */
    /* set the radix of the bcast tree */
bcast_radix = cs->radix_read_tree;
/* initialize fan-out read tree */
sm_module->fanout_read_tree=(netpatterns_tree_node_t*) malloc(
sizeof(netpatterns_tree_node_t)*module->group_size);
if( NULL == sm_module->fanout_read_tree ) {
goto Error;
}
for(i = 0; i < module->group_size; i++){
ret = netpatterns_setup_narray_tree(bcast_radix,
i, module->group_size, &(sm_module->fanout_read_tree[i]));
if(OMPI_SUCCESS != ret) {
goto Error;
}
}
ret = load_recursive_knomial_info(sm_module);
if (OMPI_SUCCESS != ret) {
BASESMUMA_VERBOSE(10, ("Failed to load recursive knomial tree"));
goto Error;
}
/* Allocate offsets array for lmsg reduce */
/*
ret = alloc_lmsg_reduce_offsets_array(sm_module);
if (OMPI_SUCCESS != ret) {
BASESMUMA_VERBOSE(10, ("Failed to allocate reduce offsets array"));
goto Error;
}
*/
/* initialize reduction tree */
sm_module->reduction_tree=(netpatterns_tree_node_t *) malloc(
sizeof(netpatterns_tree_node_t )*module->group_size);
if( NULL == sm_module->reduction_tree ) {
goto Error;
}
ret=netpatterns_setup_multinomial_tree(
cs->order_reduction_tree,module->group_size,
sm_module->reduction_tree);
if( MPI_SUCCESS != ret ) {
goto Error;
}
/* get largest power of k for given group size */
sm_module->pow_k_levels = pow_sm_k(cs->k_nomial_radix,
sm_module->super.sbgp_partner_module->group_size,
&(sm_module->pow_k));
/* get largest power of 2 for a given group size
* used in scatter allgather
*/
sm_module->pow_2_levels = pow_sm_k(2,
sm_module->super.sbgp_partner_module->group_size,
&(sm_module->pow_2));
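    /* illustrative: for a 10-process subgroup the largest power of two not
     * exceeding the group size is 8 = 2^3, i.e. pow_2 = 8 and
     * pow_2_levels = 3 */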
/*
* setup scatter data
*/
sm_module->scatter_kary_radix=cs->scatter_kary_radix;
sm_module->scatter_kary_tree=NULL;
ret=netpatterns_setup_narray_tree_contigous_ranks(
sm_module->scatter_kary_radix,
sm_module->super.sbgp_partner_module->group_size,
&(sm_module->scatter_kary_tree));
if(OMPI_SUCCESS != ret) {
        fprintf(stderr,"In mca_bcol_basesmuma_comm_query: scatter k-ary tree setup failed\n");
fflush(stderr);
return NULL;
}
/* setup the module shared memory management */
ret=base_bcol_basesmuma_setup_library_buffers(sm_module, cs);
if(OMPI_SUCCESS != ret) {
        fprintf(stderr,"In base_bcol_basesmuma_setup_library_buffers: the mpool was not successfully set up!\n");
fflush(stderr);
return NULL;
}
/* setup the collectives and memory management */
/* check to see whether or not the mpool has been inited */
/* allocate some space for the network contexts */
    if(!cs->mpool_inited) {
        /* if it is empty, then fill it for the first time */
        cs->super.network_contexts = (bcol_base_network_context_t **)
            malloc((cs->super.n_net_contexts)*
                   sizeof(bcol_base_network_context_t *));
        if (NULL == cs->super.network_contexts) {
            goto Error;
        }
        /* basic setup is needed here: define the file name, and set the
         * data segment alignment and the size of the control structure in
         * the shared-memory file.
         */
        /* give the payload sm file a name */
name_length=asprintf(&name,
"%s"OPAL_PATH_SEP"0%s%0d",
ompi_process_info.job_session_dir,
cs->payload_base_fname,
(int)getpid());
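        /* the resulting path has the form
         * <job_session_dir>/0<payload_base_fname><pid> */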
if( 0 > name_length ) {
fprintf(stderr,"Failed to assign the shared memory payload file a name\n");
fflush(stderr);
return NULL;
}
/* make sure name is not too long */
if ( OPAL_PATH_MAX < (name_length-1) ) {
fprintf(stderr,"Shared memory file name is too long!\n");
fflush(stderr);
return NULL;
}
/* set the name and alignment characteristics */
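        /* the payload file's data segment is page-aligned and carries no
         * control structure of its own */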
        sm_reg_data = (bcol_basesmuma_registration_data_t *) malloc(
            sizeof(bcol_basesmuma_registration_data_t));
        if (NULL == sm_reg_data) {
            free(name);
            goto Error;
        }
        sm_reg_data->file_name = name;
        sm_reg_data->data_seg_alignment = getpagesize();
        sm_reg_data->size_ctl_structure = 0;
        cs->super.network_contexts[0] = (bcol_base_network_context_t *)
            malloc(sizeof(bcol_base_network_context_t));
        if (NULL == cs->super.network_contexts[0]) {
            free(sm_reg_data);
            free(name);
            goto Error;
        }
        cs->super.network_contexts[0]->context_data =
            (void *) sm_reg_data;
        cs->super.network_contexts[0]->
            register_memory_fn = mca_bcol_basesmuma_register_sm;
        cs->super.network_contexts[0]->
            deregister_memory_fn = mca_bcol_basesmuma_deregister_sm;
sm_module->super.network_context = cs->super.network_contexts[0];
} else {
sm_module->super.network_context = cs->super.network_contexts[0];
}
/* Set the header size */
sm_module->super.header_size = sizeof(mca_bcol_basesmuma_header_t);
/* collective setup */
load_func_with_choices(&(sm_module->super));
/*
* This initializes all collective algorithms
*/
ret = mca_bcol_base_bcol_fns_table_init(&(sm_module->super));
if (OMPI_SUCCESS != ret) {
goto Error;
}
sm_module->super.supported_mode = 0;
if (module->use_hdl) {
sm_module->super.supported_mode = MCA_BCOL_BASE_ZERO_COPY;
}
    /* Initialize the Portals library, required for basesmuma large-message support */
#ifdef __PORTALS_AVAIL__
/* Enable zero copy mode */
sm_module->super.supported_mode = MCA_BCOL_BASE_ZERO_COPY;
ret = mca_bcol_basesmuma_portals_init(cs);
if (OMPI_SUCCESS != ret) {
return NULL;
}
sm_module->sg_state.phase = INIT;
ret = PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*)
cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q,
PTL_EQ_HANDLER_NONE, &sm_module->sg_state.read_eq);
if (ret != PTL_OK) {
BASESMUMA_VERBOSE(10,( "PtlEQAlloc() failed: %d",ret));
return NULL;
}
#endif
/* blocking recursive double barrier test */
/*
{
fprintf(stderr,"BBB About to hit the barrier test\n");
fflush(stderr);
int rc;
bcol_function_args_t bogus;
rc = bcol_basesmuma_rd_barrier_init(&(sm_module->super));
rc = bcol_basesmuma_recursive_double_barrier(
&bogus, &(sm_module->super));
}
*/
/* in this case we only expect a single network context.
in the future we should loop around this */
sm_modules[0] = &(sm_module->super);
#if 0
/* debug */
/* test resource recycling */
test_sm_module=sm_module;
/* debug */
fprintf(stderr," ZZZZ sn %lld \n",sm_module->squence_number_offset);
fflush(stderr);
/* end debug */
test_resrouce_recycle();
/* end debug */
#endif
return sm_modules;
Error:
    /* cleanup */
    if (sm_module->reduction_tree) {
        free(sm_module->reduction_tree);
        sm_module->reduction_tree = NULL;
    }
    if (sm_module->fanout_read_tree) {
        free(sm_module->fanout_read_tree);
        sm_module->fanout_read_tree = NULL;
    }
    free(sm_modules);
    return NULL;
}
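#if 0
/* Illustrative sketch (not compiled): how a caller might obtain the
 * basesmuma module from the query function above.  The helper name and the
 * decision to free() the returned array are assumptions made for
 * illustration, not part of the bcol framework contract. */
static void example_basesmuma_query(mca_sbgp_base_module_t *sbgp_module)
{
    int n_modules = 0;
    mca_bcol_base_module_t **modules =
        mca_bcol_basesmuma_comm_query(sbgp_module, &n_modules);

    if (NULL != modules) {
        /* for basesmuma n_modules is 1 and modules[0] is the shared-memory
         * bcol module backing the subgroup */
        /* ... hand modules[0] to the upper layer here ... */
        free(modules);
    }
}
#endif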
#if 0
/*
* Init module on the communicator
*/
static int
basesmuma_module_enable(mca_bcol_base_module_t *module,
struct ompi_communicator_t *comm)
{
/* local variables */
char output_buffer[2*MPI_MAX_OBJECT_NAME];
memset(&output_buffer[0],0,sizeof(output_buffer));
snprintf(output_buffer,sizeof(output_buffer),"%s (cid %d)", comm->c_name,
comm->c_contextid);
opal_output_verbose(10, mca_bcol_base_output,
"bcol:basesmuma:enable: new communicator: %s", output_buffer);
/* All done */
return OMPI_SUCCESS;
}
#endif
OBJ_CLASS_INSTANCE(mca_bcol_basesmuma_module_t,
mca_bcol_base_module_t,
mca_bcol_basesmuma_module_construct,
mca_bcol_basesmuma_module_destruct);