/* * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ #include "ompi_config.h" #include "ompi/include/ompi/constants.h" #include "ompi/mca/coll/ml/coll_ml.h" #include "ompi/mca/bcol/bcol.h" #include "ompi/mca/bcol/basesmuma/bcol_basesmuma.h" /* Overview: k-nomial (recursive k-ing) barrier for the basesmuma BCOL component. This file holds a helper macro, the barrier init function, the barrier progress function that resumes from state cached per buffer, and the routine that registers both functions with the BCOL function table. */ /* #define IS_BARRIER_READY(peer, my_flag, my_sequence_number)\ (((peer)->sequence_number == (my_sequence_number) && \ (peer)->flags[BARRIER_RKING_FLAG][bcol_id] >= (my_flag) \ )? true : false ) */ /* CALC_ACTIVE_REQUESTS walks the (tree_order - 1) entries of peers and XORs a bit into *active_requests for every entry that is negative. It uses the variable j of the enclosing scope as its loop index, so the caller must declare j. */ #define CALC_ACTIVE_REQUESTS(active_requests,peers, tree_order) \ do{ \ for( j = 0; j < (tree_order - 1); j++){ \ if( 0 > peers[j] ) { \ /* set the bit */ \ /* NOTE(review): the next statement breaks off after "(1<" - source text appears to be missing up to the following visible token. The lost span presumably holds the rest of this macro and the signature and first declarations of bcol_basesmuma_k_nomial_barrier_init (the init function registered at the end of this file); recover it from version control. */ *active_requests ^= (1<bcol_module; mca_common_netpatterns_k_exchange_node_t *exchange_node = &bcol_module->knomial_allgather_tree; mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component; uint32_t buffer_index = input_args->buffer_index; /* per-buffer state; the progress function reads the same three fields */ int *active_requests = &(bcol_module->ml_mem.nb_coll_desc[buffer_index].active_requests); int *iteration = &bcol_module->ml_mem.nb_coll_desc[buffer_index].iteration; int *status = &bcol_module->ml_mem.nb_coll_desc[buffer_index].status; int leading_dim, buff_idx, idx; int bcol_id = (int) bcol_module->super.bcol_id; int i, j, probe; int src; int pow_k, tree_order; int max_requests = 0; /* must start at 0: bits are XOR-ed into it below */ int matched = 0; int64_t sequence_number=input_args->sequence_num; int my_rank = bcol_module->super.sbgp_partner_module->my_index; volatile mca_bcol_basesmuma_payload_t *data_buffs; /* control structures */ volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; volatile mca_bcol_basesmuma_header_t *peer_ctl_pointer; #if 0 fprintf(stderr,"entering sm barrier sn = %d buff index = %d\n",sequence_number,input_args->buffer_index); #endif /* compute the offset of this buffer's entries within data_buffs */ buff_idx = input_args->buffer_index; leading_dim = 
bcol_module->colls_no_user_data.size_of_group; idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); data_buffs=(volatile mca_bcol_basesmuma_payload_t *) bcol_module->colls_with_user_data.data_buffs+idx; /* Set pointer to current proc ctrl region */ my_ctl_pointer = data_buffs[my_rank].ctl_struct; /* init the header; NOTE(review): ready_flag and flag_offset are used below without any other visible assignment, so this macro presumably sets them - confirm against its definition */ BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id); /* reset the cached per-buffer state */ *iteration = 0; *active_requests = 0; *status = 0; /* k-nomial parameters */ tree_order = exchange_node->tree_order; pow_k = exchange_node->log_tree_order; /* calculate the maximum number of requests * at each level each rank communicates with * at most (k - 1) peers * so if we set k - 1 bit fields in "max_requests", then * we have max_request == 2^(k - 1) -1 */ for(i = 0; i < (tree_order - 1); i++){ /* NOTE(review): the next statement breaks off after "(1<" - source text appears to be missing up to the following visible token (the end of this loop and the start of the condition on exchange_node's node_type); recover it from version control */ max_requests ^= (1<node_type ) { /* then I will signal to my proxy rank*/ my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id] = ready_flag; ready_flag = flag_offset + 1 + pow_k + 2; /* now, poll for completion */ src = exchange_node->rank_extra_sources_array[0]; peer_ctl_pointer = data_buffs[src].ctl_struct; for( i = 0; i < cm->num_to_probe && (0 == matched); i++ ) { if(IS_PEER_READY(peer_ctl_pointer, ready_flag, sequence_number, BARRIER_RKING_FLAG, bcol_id)){ matched = 1; goto FINISHED; } } /* cache state and bail */ *iteration = -1; return BCOL_FN_STARTED; }else if ( 0 < exchange_node->n_extra_sources ) { /* I am a proxy for someone */ src = exchange_node->rank_extra_sources_array[0]; peer_ctl_pointer = data_buffs[src].ctl_struct; /* probe for extra rank's arrival */ for( i = 0; i < cm->num_to_probe && ( 0 == matched); i++) { if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, BARRIER_RKING_FLAG, bcol_id)){ matched = 1; /* copy it in */ goto MAIN_PHASE; } } /* extra rank not seen yet: cache the flag in *status and record iteration == -1, which the progress function tests to re-enter its proxy branch */ *status = ready_flag; *iteration = -1; return BCOL_FN_STARTED; } MAIN_PHASE: /* bump the ready flag */ ready_flag++; /* we start the recursive k - ing phase */ for( *iteration = 0; *iteration < pow_k; 
(*iteration)++) { /* announce my arrival */ my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id] = ready_flag; /* calculate the number of active requests */ CALC_ACTIVE_REQUESTS(active_requests,exchange_node->rank_exchanges[*iteration],tree_order); /* Now post the recv's */ for( j = 0; j < (tree_order - 1); j++ ) { /* recv phase */ src = exchange_node->rank_exchanges[*iteration][j]; if( src < 0 ) { /* then not a valid rank, continue */ continue; } peer_ctl_pointer = data_buffs[src].ctl_struct; /* NOTE(review): the next statement breaks off after "(1<" - source text appears to be missing up to the following visible token (the rest of the bit test and the head of the probe loop); recover it from version control */ if( !(*active_requests&(1<num_to_probe && (0 == matched); probe++){ if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, BARRIER_RKING_FLAG, bcol_id)){ matched = 1; /* set this request's bit */ /* NOTE(review): the next statement breaks off after "(1<" - source text appears to be missing up to the following visible token (presumably the completion check that precedes caching *status); recover it from version control */ *active_requests ^= (1<flags[BARRIER_RKING_FLAG][bcol_id]; return BCOL_FN_STARTED; } } /* bump the flag one more time for the extra rank */ ready_flag = flag_offset + 1 + pow_k + 2; /* finish off the last piece: if I proxy for an extra rank, publish the final flag value for it */ if( 0 < exchange_node->n_extra_sources ) { /* simply announce my arrival */ my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id] = ready_flag; } FINISHED: /* advance starting_flag_value for this bcol_id before reporting completion */ my_ctl_pointer->starting_flag_value[bcol_id]++; return BCOL_FN_COMPLETE; } /* Barrier progress function. Resumes a barrier that earlier returned BCOL_FN_STARTED, using the iteration, active_requests and status values cached in nb_coll_desc[buffer_index]. Returns BCOL_FN_COMPLETE after incrementing starting_flag_value[bcol_id], or BCOL_FN_STARTED again when a peer was not seen within cm->num_to_probe probes. */ int bcol_basesmuma_k_nomial_barrier_progress(bcol_function_args_t *input_args, struct coll_ml_function_t *const_args) { /* local variables */ int flag_offset; volatile int8_t ready_flag; mca_bcol_basesmuma_module_t *bcol_module = (mca_bcol_basesmuma_module_t *) const_args->bcol_module; mca_common_netpatterns_k_exchange_node_t *exchange_node = &bcol_module->knomial_allgather_tree; mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component; uint32_t buffer_index = input_args->buffer_index; int *active_requests = &(bcol_module->ml_mem.nb_coll_desc[buffer_index].active_requests); int *iteration = &bcol_module->ml_mem.nb_coll_desc[buffer_index].iteration; int *status = &bcol_module->ml_mem.nb_coll_desc[buffer_index].status; int *iter = iteration; /* second name for the same cached iteration counter */ int 
leading_dim, idx, buff_idx; int i, j, probe; int src; int max_requests = 0; /* critical to set this */ int pow_k, tree_order; int bcol_id = (int) bcol_module->super.bcol_id; int matched = 0; int64_t sequence_number=input_args->sequence_num; int my_rank = bcol_module->super.sbgp_partner_module->my_index; volatile mca_bcol_basesmuma_payload_t *data_buffs; /* control structures */ volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; volatile mca_bcol_basesmuma_header_t *peer_ctl_pointer; #if 0 fprintf(stderr,"%d: entering sm allgather progress active requests %d iter %d ready_flag %d\n",my_rank, *active_requests,*iter,*status); #endif buff_idx = buffer_index; leading_dim=bcol_module->colls_no_user_data.size_of_group; idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); data_buffs=(volatile mca_bcol_basesmuma_payload_t *) bcol_module->colls_with_user_data.data_buffs+idx; my_ctl_pointer = data_buffs[my_rank].ctl_struct; /* read the current starting flag value and restore the ready flag from the cached status */ flag_offset = my_ctl_pointer->starting_flag_value[bcol_id]; ready_flag = *status; /* k-nomial parameters */ tree_order = exchange_node->tree_order; pow_k = exchange_node->log_tree_order; /* calculate the maximum number of requests * at each level each rank communicates with * at most (k - 1) peers * so if we set k - 1 bit fields in "max_requests", then * we have max_request == 2^(k - 1) -1 */ for(i = 0; i < (tree_order - 1); i++){ /* NOTE(review): the next statement breaks off after "(1<" - source text appears to be missing up to the following visible token (the end of this loop and the start of the condition on exchange_node's node_type); recover it from version control */ max_requests ^= (1<node_type ) { /* If I'm in here, then I must be looking for data */ ready_flag = flag_offset + 1 + pow_k + 2; src = exchange_node->rank_extra_sources_array[0]; peer_ctl_pointer = data_buffs[src].ctl_struct; for( i = 0; i < cm->num_to_probe && (0 == matched); i++ ) { if(IS_PEER_READY(peer_ctl_pointer, ready_flag, sequence_number, BARRIER_RKING_FLAG, bcol_id)){ matched = 1; goto FINISHED; } } /* haven't found it, state is cached, bail out */ return BCOL_FN_STARTED; }else if ( ( -1 == *iteration ) && (0 < exchange_node->n_extra_sources) ) { /* I am a proxy for someone 
*/ src = exchange_node->rank_extra_sources_array[0]; peer_ctl_pointer = data_buffs[src].ctl_struct; /* probe for extra rank's arrival */ for( i = 0; i < cm->num_to_probe && ( 0 == matched); i++) { if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, BARRIER_RKING_FLAG, bcol_id)){ matched = 1; /* bump the flag */ ready_flag++; *iteration = 0; goto MAIN_PHASE; } } return BCOL_FN_STARTED; } MAIN_PHASE: /* start the recursive k - ing phase; iter aliases iteration, so the loop initializer leaves the cached value in place and the loop resumes from it */ for( *iter=*iteration; *iter < pow_k; (*iter)++) { /* I am ready at this level */ my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id] = ready_flag; if( 0 == *active_requests ) { /* flip some bits, if we don't have active requests from a previous visit */ CALC_ACTIVE_REQUESTS(active_requests,exchange_node->rank_exchanges[*iter],tree_order); } for( j = 0; j < (tree_order - 1); j++ ) { /* recv phase */ src = exchange_node->rank_exchanges[*iter][j]; if( src < 0 ) { /* then not a valid rank, continue */ continue; } peer_ctl_pointer = data_buffs[src].ctl_struct; /* NOTE(review): the next statement breaks off after "(1<" - source text appears to be missing up to the following visible token (the rest of the bit test and the head of the probe loop); recover it from version control */ if( !(*active_requests&(1<num_to_probe && (0 == matched); probe++){ if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, BARRIER_RKING_FLAG, bcol_id)){ matched = 1; /* flip the request's bit */ /* NOTE(review): the next statement breaks off after "(1<" - source text appears to be missing up to the following visible token (presumably the completion check that precedes caching *status); recover it from version control */ *active_requests ^= (1<flags[BARRIER_RKING_FLAG][bcol_id]; return BCOL_FN_STARTED; } } /* bump the flag one more time for the extra rank */ ready_flag = flag_offset + 1 + pow_k + 2; /* finish off the last piece: if I proxy for an extra rank, publish the final flag value for it */ if( 0 < exchange_node->n_extra_sources ) { /* simply announce my arrival */ my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id] = ready_flag; } FINISHED: /* advance starting_flag_value for this bcol_id before reporting completion */ my_ctl_pointer->starting_flag_value[bcol_id]++; return BCOL_FN_COMPLETE; } /* Register the k-nomial barrier init and progress functions in the BCOL function table, * so they can be selected. Always returns OMPI_SUCCESS; the result of * mca_bcol_base_set_attributes() is not checked here. */ int bcol_basesmuma_barrier_init(mca_bcol_base_module_t *super) { mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; comm_attribs.bcoll_type = 
BCOL_BARRIER; comm_attribs.comm_size_min = 0; comm_attribs.comm_size_max = 1024 * 1024; comm_attribs.waiting_semantics = NON_BLOCKING; inv_attribs.bcol_msg_min = 0; inv_attribs.bcol_msg_max = 20000; /* range 1 */ inv_attribs.datatype_bitmap = 0xffffffff; inv_attribs.op_types_bitmap = 0xffffffff; comm_attribs.data_src = DATA_SRC_KNOWN; mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, bcol_basesmuma_k_nomial_barrier_init, bcol_basesmuma_k_nomial_barrier_progress); return OMPI_SUCCESS; }