diff --git a/src/mca/coll/hierarch/coll_hierarch.c b/src/mca/coll/hierarch/coll_hierarch.c
index 998eb97f35..362c8fa25c 100644
--- a/src/mca/coll/hierarch/coll_hierarch.c
+++ b/src/mca/coll/hierarch/coll_hierarch.c
@@ -25,25 +25,22 @@
 #include "mca/coll/base/base.h"
 #include "coll_hierarch.h"
 
-/**
- * NOTE NOTE NOTE NOTE:
- * this is a preliminary version dealing just with sm/non-sm layers.
- * It's main purpose is to understand the information and data flow
- * better, and for developing a first cut of the required interfaces.
- *
- * EG, Stuttgart, Feb. 24 2005
- */
-
 #include "mca/ptl/ptl.h"
 #include "mca/pml/teg/src/pml_teg_proc.h"
 #include "mca/pml/teg/src/pml_teg_ptl.h"
 
 /* local functions and data */
+#define HIER_MAXPROTOCOL 7
+static int mca_coll_hierarch_max_protocol=HIER_MAXPROTOCOL;
+
+static char hier_prot[HIER_MAXPROTOCOL][6]={"0","tcp","ib","gm","mx","elan4","sm"};
+
 static void mca_coll_hierarch_checkfor_component (struct ompi_communicator_t *comm,
                                                   char *component_name,
                                                   int *key,
                                                   int *done );
 static void mca_coll_hierarch_dump_struct ( struct mca_coll_base_comm_t *c);
+
 
 /*
  * Linear set of collective algorithms
  */
@@ -106,7 +103,7 @@ mca_coll_hierarch_comm_query(struct ompi_communicator_t *comm, int *priority,
     int size;
     int color, ncount, maxncount;
     int *colorarr=NULL;
-
+    int level;
 
     /* Get the priority level attached to this module */
     if (OMPI_SUCCESS != mca_base_param_lookup_int(mca_coll_hierarch_priority_param,
@@ -120,44 +117,64 @@ mca_coll_hierarch_comm_query(struct ompi_communicator_t *comm, int *priority,
         return NULL;
     }
 
-    /* Check now, whether all process in this communicator can talk with
-       sm or not. If yes, then there is no need for the hierarchical
-       module */
+    /* This array holds the color of each process returned for a certain
+       protocol. It is one element larger than the communicator size; the
+       extra element stores the position of the selected protocol in the
+       hier_prot array. This avoids having to walk through the whole list
+       again in module_init.
+    */
     size = ompi_comm_size(comm);
-    colorarr = (int *) malloc ( sizeof(int) * size );
+    colorarr = (int *) malloc ( sizeof(int) * (size + 1) );
     if ( NULL == colorarr ) {
         *priority = 0;
         return NULL;
     }
 
-    mca_coll_hierarch_checkfor_component ( comm, "sm", &color, &ncount);
+    /*
+     * Walk through the list of registered protocols and check which one
+     * is feasible.
+     * Later we will start with level=0 and introduce the multi-cell check.
+     */
+    for ( level = 1; level < mca_coll_hierarch_max_protocol; level++) {
+        mca_coll_hierarch_checkfor_component ( comm, hier_prot[level], &color, &ncount);
 
-    comm->c_coll_basic_module->coll_allreduce (&ncount, &maxncount, 1, MPI_INT,
-                                               MPI_MAX, comm );
-    comm->c_coll_basic_module->coll_allgather (&color, 1, MPI_INT,
-                                               colorarr, 1, MPI_INT, comm );
+        comm->c_coll_basic_module->coll_allreduce (&ncount, &maxncount, 1, MPI_INT,
+                                                   MPI_MAX, comm );
+        comm->c_coll_basic_module->coll_allgather (&color, 1, MPI_INT,
+                                                   colorarr, 1, MPI_INT, comm );
 
-    if ( 1 == maxncount ) {
-        /*
-         * this means, no process has a partner to which it can talk with 'sm',
-         * no need for the hierarchical component
-         */
-        *priority = 0;
-        return NULL;
+        if ( 0 == maxncount ) {
+            /*
+             * No process has a partner to which it can talk with this
+             * protocol, so continue to the next level.
+             */
+            continue;
+        }
+        else if ( maxncount == (size-1) ) {
+            /*
+             * Everybody can talk to every other process with this protocol;
+             * no need to continue in the hierarchy tree, and no need for the
+             * hierarchical component.
+             * It is (size-1) because we do not count ourselves.
+             */
+            goto err_exit;
+        }
+        else {
+            colorarr[size] = level;
+            *data = (struct mca_coll_base_comm_t *) colorarr;
+            return &intra;
+        }
     }
-    else if ( maxncount == size ) {
-        /*
-         * everybody can talk to every other process with sm,
-         * no need for the hierarchical module
-         */
-        *priority = 0;
-        return NULL;
+
+ err_exit:
+    if ( NULL != colorarr ) {
+        free ( colorarr ) ;
     }
-    *data = (struct mca_coll_base_comm_t *) colorarr;
-    return &intra;
+    *priority = 0;
+    return NULL;
 }
-    
+
 
 /*
  * Init module on the communicator
@@ -168,7 +185,7 @@ mca_coll_hierarch_module_init(struct ompi_communicator_t *comm)
     int color, ncount;
     int *colorarr=NULL, *llr=NULL;
     int size, rank, ret=OMPI_SUCCESS;
-    int i, j, c;
+    int i, j, c, level;
     int found;
     struct ompi_communicator_t *llcomm=NULL;
 
@@ -176,7 +193,11 @@ mca_coll_hierarch_module_init(struct ompi_communicator_t *comm)
     rank = ompi_comm_rank(comm);
     size = ompi_comm_size(comm);
 
-    mca_coll_hierarch_checkfor_component ( comm, "sm", &color, &ncount);
+
+    colorarr = (int *) comm->c_coll_selected_data;
+    level    = colorarr[size];
+
+    mca_coll_hierarch_checkfor_component ( comm, hier_prot[level], &color, &ncount);
 
     /* Generate the subcommunicator based on the color returned by
        the previous function. */
@@ -201,7 +222,6 @@ mca_coll_hierarch_module_init(struct ompi_communicator_t *comm)
     data->hier_am_lleader=0;       /* false */
 
     /* determine how many local leader there are and who they are */
-    colorarr = (int *) comm->c_coll_selected_data;
     llr = (int *) calloc (1, sizeof(int) * size);
     if (NULL == llr ) {
         goto exit;
@@ -319,44 +339,74 @@ mca_coll_hierarch_checkfor_component ( struct ompi_communicator_t *comm,
     mca_ptl_base_module_t *ptl_module=NULL;
     mca_ptl_base_component_t *ptr=NULL;
-    int i, j, size, listsize;
+    int i, j, size;
     int counter=0;
     int firstproc=999999;
-    int myrank = -1;
+    int rank = -1;
+    int listsize=1;
+    int use_next, walk_through_list;
+
+    /* default values in case an error occurs */
+    *ncount=0;
+    *key=MPI_UNDEFINED;
+
+    /* Shall we walk through the whole ptl list, or just check the first element? */
+    if (OMPI_SUCCESS != mca_base_param_lookup_int(mca_coll_hierarch_walk_through_list_param,
+                                                  &walk_through_list)) {
+        return;
+    }
+
+    /* Shall we use the first_elem list or the next_elem list? */
+    if (OMPI_SUCCESS != mca_base_param_lookup_int(mca_coll_hierarch_use_next_param,
+                                                  &use_next)) {
+        return;
+    }
+
+    size = ompi_comm_size ( comm );
+    rank = ompi_comm_rank ( comm );
 
     for ( i=0; i<size; i++ ) {
         proc = mca_pml_teg_proc_lookup_remote ( comm, i );
-#if 0
-        ptl_proc=mca_ptl_array_get_next(&proc->proc_ptl_next);
-        listsize = mca_ptl_array_get_size(&proc->proc_ptl_next);
-#else
-        ptl_proc=mca_ptl_array_get_next(&proc->proc_ptl_first);
-        listsize = mca_ptl_array_get_size(&proc->proc_ptl_first);
-#endif
-        for ( j=0; j<listsize; j++) {
+        if ( !use_next ) {
+            ptl_proc=mca_ptl_array_get_next(&proc->proc_ptl_first);
+            if ( walk_through_list ) {
+                /*
+                 * Walking through the list might be unnecessary. The assumption
+                 * is that if we did not register this as the first protocol,
+                 * there is a protocol which is faster than this one.
+                 *
+                 * Example: on an IB cluster with dual-processor nodes, I can
+                 * talk to all procs with IB, however the process on my node
+                 * will hopefully have sm registered as its first protocol.
+                 */
+                listsize = mca_ptl_array_get_size(&proc->proc_ptl_first);
+            }
+        }
+        else {
+            ptl_proc=mca_ptl_array_get_next(&proc->proc_ptl_next);
+            if ( walk_through_list ) {
+                listsize = mca_ptl_array_get_size(&proc->proc_ptl_next);
+            }
+        }
+
+        for ( j=0; j<listsize; j++) {
             ptl_module = ptl_proc[j]->ptl;
             ptr = ptl_module->ptl_component;
-
+            /* sanity check */
             if ( strcmp(ptr->ptlm_version.mca_type_name,"ptl") ) {
                 printf("Oops, got the wrong component! type_name = %s\n",
                       ptr->ptlm_version.mca_type_name );
            }
 
-           /* check for myself.
-              ATTENTION: this relies on having the self-ptl-component loaded
-              at this case. Need something better!
-           */
-           if ( !strcmp (ptr->ptlm_version.mca_component_name, "self")) {
-               counter++;
-               myrank = i;
-               continue;
-           }
-
-           /* check for the required component */
            if (! strcmp (ptr->ptlm_version.mca_component_name, component_name)){
                counter++;
@@ -367,21 +417,17 @@ mca_coll_hierarch_checkfor_component ( struct ompi_communicator_t *comm,
            }
        }
    }
-    
+
    *ncount = counter; /* true */
    /* final decision */
-    if ( counter == 1 ) {
+    if ( counter == 0 ) {
        /* this is the section indicating, that we are not using this component */
-       if ( myrank == -1 ) {
-       }
-       else {
-           firstproc = MPI_UNDEFINED;
-       }
+       firstproc = MPI_UNDEFINED;
    }
    else {
-       if ( myrank < firstproc ) {
-           firstproc = myrank;
+       if ( rank < firstproc ) {
+           firstproc = rank;
        }
    }
diff --git a/src/mca/coll/hierarch/coll_hierarch_bcast.c b/src/mca/coll/hierarch/coll_hierarch_bcast.c
index f682c5e933..63c0415f86 100644
--- a/src/mca/coll/hierarch/coll_hierarch_bcast.c
+++ b/src/mca/coll/hierarch/coll_hierarch_bcast.c
@@ -25,22 +25,8 @@
 #include "mca/coll/base/coll_tags.h"
 #include "coll_hierarch.h"
 
-static int mca_coll_hierarch_intra_segmented_bcast ( void* buffer,
-                                                     int count,
-                                                     ompi_datatype_t * datatype,
-                                                     int root,
-                                                     ompi_communicator_t * comm,
-                                                     int segsize,
-                                                     struct mca_coll_hierarch_topo *topo);
-static int mca_coll_hierarch_intra_bcast_setup_topo (int count,
-                                                     ompi_datatype_t *datatype,
-                                                     int root,
-                                                     struct mca_coll_base_comm_t *data,
-                                                     int *segsize);
-static void setup_topo_bmtree ( int root, struct mca_coll_base_comm_t *data );
-
-#ifdef SIMPLE_HIERARCH_BCAST
+#ifdef SIMPLE_HIERARCH
 /*
  * bcast_intra
  *
@@ -108,6 +94,22 @@ int mca_coll_hierarch_bcast_intra(void *buff,
 }
 #else
 
+static int mca_coll_hierarch_intra_segmented_bcast ( void* buffer,
+                                                     int count,
+                                                     ompi_datatype_t * datatype,
+                                                     int root,
+                                                     ompi_communicator_t * comm,
+                                                     int segsize,
+                                                     struct mca_coll_hierarch_topo *topo);
+
+static int mca_coll_hierarch_intra_bcast_setup_topo (int count,
+                                                     ompi_datatype_t *datatype,
+                                                     int root,
+                                                     struct mca_coll_base_comm_t *data,
+                                                     int *segsize);
+static void setup_topo_bmtree ( int root, struct mca_coll_base_comm_t *data );
+
+
 int mca_coll_hierarch_bcast_intra(void *buff,
                                   int count,
                                   struct ompi_datatype_t *datatype,
@@ -242,7 +244,12 @@ static int mca_coll_hierarch_intra_segmented_bcast ( void* buffer,
 
    /* Post Irecv if not root-node */
    if (rank != root) {
       /* has a parent. need to receive before sending */
-      recv_request = (MPI_Request*)malloc ( sizeof(ompi_request_t *)*num_segments );
+      if ( num_segments > 2 * size ) {
+          recv_request = (MPI_Request*)malloc ( sizeof(ompi_request_t *)*num_segments );
+      }
+      else {
+          recv_request = comm->c_coll_selected_data->hier_reqs;
+      }
 
       for( i = 0; i < num_segments; i++) {
           if ( i == (num_segments -1) ) {
@@ -301,8 +308,10 @@
       }  /* for ( i = 0; i < num_segments; i++) */
    }
 
-   if(recv_request != NULL) {
-      free(recv_request);
+   if ( num_segments > 2 * size ) {
+      if(recv_request != NULL) {
+         free(recv_request);
+      }
    }
 
    return OMPI_SUCCESS;
@@ -356,7 +365,7 @@ static void setup_topo_bmtree ( int root, struct mca_coll_base_comm_t *data )
    }
    else {
       size = data->hier_num_lleaders + 1;
-      data->hier_lleaders[data->hier_num_lleaders] = root;
+      data->hier_lleaders[rootpos] = root;
    }
 
    rank = data->hier_my_lleader;
diff --git a/src/mca/coll/hierarch/coll_hierarch_component.c b/src/mca/coll/hierarch/coll_hierarch_component.c
index 32112b0b1d..951445c4b4 100644
--- a/src/mca/coll/hierarch/coll_hierarch_component.c
+++ b/src/mca/coll/hierarch/coll_hierarch_component.c
@@ -40,6 +40,9 @@ const char *mca_coll_hierarch_component_version_string =
 int mca_coll_hierarch_priority_param = -1;
 int mca_coll_hierarch_verbose_param = -1;
 int mca_coll_hierarch_verbose = 0;
+int mca_coll_hierarch_walk_through_list_param=-1;
+int mca_coll_hierarch_use_next_param=-1;
+
 
 /*
  * Local function
@@ -93,13 +96,17 @@ const mca_coll_base_component_1_0_0_t mca_coll_hierarch_component = {
 
 static int hierarch_open(void)
 {
-    /* Use a low priority, but allow other components to be lower */
+    /* Use a high priority, but allow other components to be higher */
     mca_coll_hierarch_priority_param =
-        mca_base_param_register_int("coll", "hierarch", "priority", NULL, 20);
+        mca_base_param_register_int("coll", "hierarch", "priority", NULL, 50);
 
     mca_coll_hierarch_verbose_param =
         mca_base_param_register_int("coll", "hierarch", "verbose", NULL,
                                     mca_coll_hierarch_verbose);
+    mca_coll_hierarch_walk_through_list_param =
+        mca_base_param_register_int("coll", "hierarch", "walk_through_list", NULL, 0);
+    mca_coll_hierarch_use_next_param =
+        mca_base_param_register_int("coll", "hierarch", "use_next", NULL, 0);
 
     return OMPI_SUCCESS;
 }
diff --git a/src/mca/coll/hierarch/coll_hierarch_reduce.c b/src/mca/coll/hierarch/coll_hierarch_reduce.c
index d05afe5eb3..ec424616d6 100644
--- a/src/mca/coll/hierarch/coll_hierarch_reduce.c
+++ b/src/mca/coll/hierarch/coll_hierarch_reduce.c
@@ -27,6 +27,7 @@
 
 #include "coll_hierarch.h"
 
+#ifdef SIMPLE_HIERARCH
 /*
  * reduce_intra
  *
@@ -34,6 +35,98 @@
  *  Accepts:    - same as MPI_Reduce()
  *  Returns:    - MPI_SUCCESS or error code
  */
+int mca_coll_hierarch_reduce_intra(void *sbuf, void *rbuf, int count,
+                                   struct ompi_datatype_t *dtype,
+                                   struct ompi_op_t *op,
+                                   int root, struct ompi_communicator_t *comm)
+{
+    struct mca_coll_base_comm_t *data=NULL;
+    struct ompi_communicator_t *llcomm=NULL;
+    long true_lb, true_extent, lb, extent;
+    char *free_buffer = NULL;
+    char *pml_buffer = NULL;
+    int i, rank, ret=OMPI_SUCCESS;
+
+    rank = ompi_comm_rank ( comm );
+    data = comm->c_coll_selected_data;
+    llcomm = data->hier_llcomm;
+
+
+    /*
+     * Collect the data from the low-level communicators. The result will be
+     * stored on the local leaders.
+     */
+    if ( MPI_COMM_NULL != llcomm ) {
+        ret = llcomm->c_coll.coll_reduce(sbuf, rbuf, count, dtype, op,
+                                         data->hier_my_lleader, llcomm );
+    }
+
+
+    /* Trivial linear reduction receiving the data from all local leaders.
+       Need something significantly better. */
+    if ( rank == root ) {
+        /* Root receives and reduces messages */
+        ompi_ddt_get_extent(dtype, &lb, &extent);
+        ompi_ddt_get_true_extent(dtype, &true_lb, &true_extent);
+
+        free_buffer = malloc(true_extent + (count - 1) * extent);
+        if (NULL == free_buffer) {
+            return OMPI_ERR_OUT_OF_RESOURCE;
+        }
+        pml_buffer = free_buffer - lb;
+
+        if ( !data->hier_am_lleader ) {
+            /* Initialize the receive buffer. */
+            ret = mca_pml.pml_recv(rbuf, count, dtype, data->hier_lleaders[0],
+                                   MCA_COLL_BASE_TAG_REDUCE, comm,
+                                   MPI_STATUS_IGNORE);
+            if (MPI_SUCCESS != ret) {
+                goto exit;
+            }
+        }
+
+        /* Loop receiving and calling reduction function (C or Fortran). */
+        for (i = 1; i < data->hier_num_lleaders; i++) {
+            if ( data->hier_lleaders[i] == rank ) {
+                continue;
+            }
+            ret = mca_pml.pml_recv(pml_buffer, count, dtype, data->hier_lleaders[i],
+                                   MCA_COLL_BASE_TAG_REDUCE, comm,
+                                   MPI_STATUS_IGNORE);
+            if (MPI_SUCCESS != ret) {
+                goto exit;
+            }
+
+            /* Perform the reduction */
+            ompi_op_reduce(op, pml_buffer, rbuf, count, dtype);
+        }
+    }
+    else if ( data->hier_am_lleader ) {
+        if ( MPI_COMM_NULL != llcomm ) {
+            ret = mca_pml.pml_send ( rbuf, count, dtype, root,
+                                     MCA_COLL_BASE_TAG_REDUCE,
+                                     MCA_PML_BASE_SEND_STANDARD,
+                                     comm);
+        }
+        else {
+            ret = mca_pml.pml_send ( sbuf, count, dtype, root,
+                                     MCA_COLL_BASE_TAG_REDUCE,
+                                     MCA_PML_BASE_SEND_STANDARD,
+                                     comm);
+        }
+    }
+
+ exit:
+    if ( NULL != free_buffer ) {
+        free ( free_buffer);
+    }
+
+
+    return ret;
+}
+
+#else
+
 int mca_coll_hierarch_reduce_intra(void *sbuf, void *rbuf, int count,
                                    struct ompi_datatype_t *dtype,
                                    struct ompi_op_t *op,
@@ -44,4 +137,5 @@ int mca_coll_hierarch_reduce_intra(void *sbuf, void *rbuf, int count,
                                     op, root, comm);
 }
 
+#endif
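For reference, the hand-off between comm_query and module_init in this patch relies on a small convention: comm_query allocates size+1 ints, fills elements 0..size-1 with the colors gathered from all processes, and stores the index of the chosen hier_prot entry in element size; module_init then reads that index back instead of re-scanning the ptl lists. The stand-alone sketch below is an editor's illustration of that convention only, not code from the Open MPI tree; the communicator size, color values, and selected level are made up.

```c
/* Illustrative sketch only -- mirrors the colorarr convention used by the
 * patch, with made-up values; not code from the Open MPI tree. */
#include <stdio.h>
#include <stdlib.h>

#define HIER_MAXPROTOCOL 7
static const char *hier_prot[HIER_MAXPROTOCOL] =
    { "0", "tcp", "ib", "gm", "mx", "elan4", "sm" };

int main(void)
{
    int size  = 4;   /* pretend communicator size */
    int level = 6;   /* pretend index of the selected protocol ("sm") */
    int i;

    /* comm_query: one extra element behind the per-process colors */
    int *colorarr = (int *) malloc(sizeof(int) * (size + 1));
    if (NULL == colorarr) {
        return 1;
    }
    for (i = 0; i < size; i++) {
        colorarr[i] = i / 2;        /* pretend colors from the allgather */
    }
    colorarr[size] = level;         /* remember which protocol was chosen */

    /* module_init: read the level back instead of walking the list again */
    printf("selected protocol: %s, my color: %d\n",
           hier_prot[colorarr[size]], colorarr[0]);

    free(colorarr);
    return 0;
}
```

Assuming the usual MCA parameter naming of that code base, the two switches registered in coll_hierarch_component.c would be reachable as coll_hierarch_walk_through_list and coll_hierarch_use_next (both defaulting to 0), e.g. via -mca coll_hierarch_use_next 1 on the mpirun command line.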