diff --git a/ompi/communicator/comm.c b/ompi/communicator/comm.c index 4c6a7a7b4f..649979746d 100644 --- a/ompi/communicator/comm.c +++ b/ompi/communicator/comm.c @@ -401,11 +401,10 @@ int ompi_comm_create ( ompi_communicator_t *comm, ompi_group_t *group, /**********************************************************************/ /**********************************************************************/ /**********************************************************************/ -/* -** Counterpart to MPI_Comm_split. To be used within OMPI (e.g. MPI_Cart_sub). -*/ -int ompi_comm_split( ompi_communicator_t* comm, int color, int key, - ompi_communicator_t **newcomm, bool pass_on_topo ) + +int ompi_comm_split_with_info( ompi_communicator_t* comm, int color, int key, + opal_info_t *info, + ompi_communicator_t **newcomm, bool pass_on_topo ) { int myinfo[2]; int size, my_size; @@ -611,7 +610,11 @@ int ompi_comm_split( ompi_communicator_t* comm, int color, int key, snprintf(newcomp->c_name, MPI_MAX_OBJECT_NAME, "MPI COMMUNICATOR %d SPLIT FROM %d", newcomp->c_contextid, comm->c_contextid ); - + /* Copy info if there is one */ + if (info) { + newcomp->super.s_info = OBJ_NEW(opal_info_t); + opal_info_dup(info, &(newcomp->super.s_info)); + } /* Activate the communicator and init coll-component */ rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode); @@ -638,6 +641,15 @@ int ompi_comm_split( ompi_communicator_t* comm, int color, int key, } +/* +** Counterpart to MPI_Comm_split. To be used within OMPI (e.g. MPI_Cart_sub). 
+*/ +int ompi_comm_split( ompi_communicator_t* comm, int color, int key, + ompi_communicator_t **newcomm, bool pass_on_topo ) +{ + return ompi_comm_split_with_info(comm, color, key, NULL, newcomm, pass_on_topo); +} + /**********************************************************************/ /**********************************************************************/ /**********************************************************************/ diff --git a/ompi/communicator/communicator.h b/ompi/communicator/communicator.h index 8936b7f1df..01c0261488 100644 --- a/ompi/communicator/communicator.h +++ b/ompi/communicator/communicator.h @@ -463,6 +463,21 @@ int ompi_topo_dist_graph_create_adjacent(ompi_communicator_t *old_comm, OMPI_DECLSPEC int ompi_comm_split (ompi_communicator_t *comm, int color, int key, ompi_communicator_t** newcomm, bool pass_on_topo); +/** + * split a communicator based on color and key. Parameters + * are identical to the MPI-counterpart of the function. + * Similar to \see ompi_comm_split with an additional info parameter. + * + * @param comm: input communicator + * @param color + * @param key + * + * @ + */ +OMPI_DECLSPEC int ompi_comm_split_with_info( ompi_communicator_t* comm, int color, int key, + opal_info_t *info, + ompi_communicator_t **newcomm, bool pass_on_topo ); + /** * split a communicator based on type and key. Parameters * are identical to the MPI-counterpart of the function. diff --git a/ompi/group/group.c b/ompi/group/group.c index f5cc88be98..9e368c96da 100644 --- a/ompi/group/group.c +++ b/ompi/group/group.c @@ -578,3 +578,31 @@ bool ompi_group_have_remote_peers (ompi_group_t *group) return false; } + +/** + * Count the number of processes on this group that share the same node as + * this process. 
+ */ +int ompi_group_count_local_peers (ompi_group_t *group) +{ + int local_peers = 0; + for (int i = 0 ; i < group->grp_proc_count ; ++i) { + ompi_proc_t *proc = NULL; +#if OMPI_GROUP_SPARSE + proc = ompi_group_peer_lookup (group, i); +#else + proc = ompi_group_get_proc_ptr_raw (group, i); + if (ompi_proc_is_sentinel (proc)) { + /* the proc must be stored in the group or cached in the proc + * hash table if the process resides in the local node + * (see ompi_proc_complete_init) */ + continue; + } +#endif + if (OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)) { + local_peers++; + } + } + + return local_peers; +} diff --git a/ompi/group/group.h b/ompi/group/group.h index 661666246e..d1cf7d99ae 100644 --- a/ompi/group/group.h +++ b/ompi/group/group.h @@ -420,8 +420,16 @@ static inline struct ompi_proc_t *ompi_group_peer_lookup_existing (ompi_group_t return ompi_group_get_proc_ptr (group, peer_id, false); } +/** + * Return true if all processes in the group are not on the local node. + */ bool ompi_group_have_remote_peers (ompi_group_t *group); +/** + * Count the number of processes on the local node. 
+ */ +int ompi_group_count_local_peers (ompi_group_t *group); + /** * Function to print the group info */ diff --git a/ompi/mca/coll/adapt/coll_adapt_ibcast.c b/ompi/mca/coll/adapt/coll_adapt_ibcast.c index b22982c011..605d626230 100644 --- a/ompi/mca/coll/adapt/coll_adapt_ibcast.c +++ b/ompi/mca/coll/adapt/coll_adapt_ibcast.c @@ -178,7 +178,7 @@ static int send_cb(ompi_request_t * req) || (context->con->tree->tree_nextsize > 0 && rank != context->con->root && num_sent == context->con->tree->tree_nextsize * context->con->num_segs && num_recv_fini == context->con->num_segs)) { - OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Singal in send\n", + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Signal in send\n", ompi_comm_rank(context->con->comm))); ibcast_request_fini(context); } @@ -306,7 +306,7 @@ static int recv_cb(ompi_request_t * req) && num_recv_fini == context->con->num_segs) || (context->con->tree->tree_nextsize == 0 && num_recv_fini == context->con->num_segs)) { - OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Singal in recv\n", + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Signal in recv\n", ompi_comm_rank(context->con->comm))); ibcast_request_fini(context); } diff --git a/ompi/mca/coll/base/coll_base_comm_select.c b/ompi/mca/coll/base/coll_base_comm_select.c index b853f1ad26..8c6023d411 100644 --- a/ompi/mca/coll/base/coll_base_comm_select.c +++ b/ompi/mca/coll/base/coll_base_comm_select.c @@ -21,6 +21,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. * Copyright (c) 2017 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 BULL S.A.S. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -37,6 +38,7 @@ #include "mpi.h" #include "ompi/communicator/communicator.h" #include "opal/util/output.h" +#include "opal/util/argv.h" #include "opal/util/show_help.h" #include "opal/class/opal_list.h" #include "opal/class/opal_object.h" @@ -44,20 +46,12 @@ #include "opal/mca/base/base.h" #include "ompi/mca/coll/coll.h" #include "ompi/mca/coll/base/base.h" - +#include "ompi/mca/coll/base/coll_base_util.h" /* - * Local types + * Stuff for the OBJ interface */ -struct avail_coll_t { - opal_list_item_t super; - - int ac_priority; - mca_coll_base_module_2_3_0_t *ac_module; - const char * ac_component_name; -}; -typedef struct avail_coll_t avail_coll_t; - +OBJ_CLASS_INSTANCE(mca_coll_base_avail_coll_t, opal_list_item_t, NULL, NULL); /* * Local functions @@ -77,12 +71,6 @@ static int query_2_0_0(const mca_coll_base_component_2_0_0_t * int *priority, mca_coll_base_module_2_3_0_t ** module); -/* - * Stuff for the OBJ interface - */ -static OBJ_CLASS_INSTANCE(avail_coll_t, opal_list_item_t, NULL, NULL); - - #define COPY(module, comm, func) \ do { \ if (NULL != module->coll_ ## func) { \ @@ -138,11 +126,14 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm) /* FIX ME - Do some kind of collective operation to find a module that everyone has available */ + /* List to store every valid module */ + comm->c_coll->module_list = OBJ_NEW(opal_list_t); + /* do the selection loop */ for (item = opal_list_remove_first(selectable); NULL != item; item = opal_list_remove_first(selectable)) { - avail_coll_t *avail = (avail_coll_t *) item; + mca_coll_base_avail_coll_t *avail = (mca_coll_base_avail_coll_t *) item; /* initialize the module */ ret = avail->ac_module->coll_module_enable(avail->ac_module, comm); @@ -153,6 +144,9 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm) (OMPI_SUCCESS == ret ? 
"Enabled": "Disabled") ); if (OMPI_SUCCESS == ret) { + /* Save every component that is initialized, + * queried and enabled successfully */ + opal_list_append(comm->c_coll->module_list, &avail->super); /* copy over any of the pointers */ COPY(avail->ac_module, comm, allgather); @@ -230,10 +224,11 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm) COPY(avail->ac_module, comm, neighbor_alltoallw_init); COPY(avail->ac_module, comm, reduce_local); + } else { + /* release the original module reference and the list item */ + OBJ_RELEASE(avail->ac_module); + OBJ_RELEASE(avail); } - /* release the original module reference and the list item */ - OBJ_RELEASE(avail->ac_module); - OBJ_RELEASE(avail); } /* Done with the list from the check_components() call so release it. */ @@ -306,8 +301,8 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm) static int avail_coll_compare (opal_list_item_t **a, opal_list_item_t **b) { - avail_coll_t *acoll = (avail_coll_t *) *a; - avail_coll_t *bcoll = (avail_coll_t *) *b; + mca_coll_base_avail_coll_t *acoll = (mca_coll_base_avail_coll_t *) *a; + mca_coll_base_avail_coll_t *bcoll = (mca_coll_base_avail_coll_t *) *b; if (acoll->ac_priority > bcoll->ac_priority) { return 1; @@ -318,6 +313,20 @@ static int avail_coll_compare (opal_list_item_t **a, return 0; } +static inline int +component_in_argv(char **argv, const char* component_name) +{ + if( NULL != argv ) { + while( NULL != *argv ) { + if( 0 == strcmp(component_name, *argv) ) { + return 1; + } + argv++; /* move to the next argument */ + } + } + return 0; +} + /* * For each module in the list, check and see if it wants to run, and * do the resulting priority comparison. 
Make a list of modules to be @@ -327,13 +336,66 @@ static int avail_coll_compare (opal_list_item_t **a, static opal_list_t *check_components(opal_list_t * components, ompi_communicator_t * comm) { - int priority; + int priority, flag; const mca_base_component_t *component; mca_base_component_list_item_t *cli; mca_coll_base_module_2_3_0_t *module; opal_list_t *selectable; - avail_coll_t *avail; + mca_coll_base_avail_coll_t *avail; + char info_val[OPAL_MAX_INFO_VAL+1]; + char **coll_argv = NULL, **coll_exclude = NULL, **coll_include = NULL; + /* Check if this communicator comes with restrictions on the collective modules + * it wants to use. The restrictions are consistent with the MCA parameter + * to limit the collective components loaded, but it applies for each + * communicator and is provided as an info key during the communicator + * creation. Unlike the MCA param, this info key is used not to select + * components but either to prevent components from being used or to + * force a change in the component priority. 
+ */ + if( NULL != comm->super.s_info) { + opal_info_get(comm->super.s_info, "ompi_comm_coll_preference", + sizeof(info_val), info_val, &flag); + if( !flag ) { + goto proceed_to_select; + } + coll_argv = opal_argv_split(info_val, ','); + if(NULL == coll_argv) { + goto proceed_to_select; + } + int idx2, count_include = opal_argv_count(coll_argv); + /* Allocate the coll_include argv */ + coll_include = (char**)malloc((count_include + 1) * sizeof(char*)); + coll_include[count_include] = NULL; /* NULL terminated array */ + /* Dispatch the include/exclude in the corresponding arrays */ + for( int idx = 0; NULL != coll_argv[idx]; idx++ ) { + if( '^' == coll_argv[idx][0] ) { + coll_include[idx] = NULL; /* NULL terminated array */ + + /* Allocate the coll_exclude argv */ + coll_exclude = (char**)malloc((count_include - idx + 1) * sizeof(char*)); + /* save the exclude components */ + for( idx2 = idx; NULL != coll_argv[idx2]; idx2++ ) { + coll_exclude[idx2 - idx] = coll_argv[idx2]; + } + coll_exclude[idx2 - idx] = NULL; /* NULL-terminated array */ + coll_exclude[0] = coll_exclude[0] + 1; /* get rid of the ^ */ + count_include = idx; + break; + } + coll_include[idx] = coll_argv[idx]; + } + /* Reverse the order of the coll_include argv to facilitate the ordering of + * the selected components reverse. 
+ */ + for( idx2 = 0; idx2 < (count_include - 1); idx2++ ) { + char* temp = coll_include[idx2]; + coll_include[idx2] = coll_include[count_include - 1]; + coll_include[count_include - 1] = temp; + count_include--; + } + } + proceed_to_select: /* Make a list of the components that query successfully */ selectable = OBJ_NEW(opal_list_t); @@ -341,11 +403,18 @@ static opal_list_t *check_components(opal_list_t * components, OPAL_LIST_FOREACH(cli, &ompi_coll_base_framework.framework_components, mca_base_component_list_item_t) { component = cli->cli_component; + /* dont bother is we have this component in the exclusion list */ + if( component_in_argv(coll_exclude, component->mca_component_name) ) { + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:base:comm_select: component disqualified: %s (due to communicator info key)", + component->mca_component_name ); + continue; + } priority = check_one_component(comm, component, &module); if (priority >= 0) { /* We have a component that indicated that it wants to run by giving us a module */ - avail = OBJ_NEW(avail_coll_t); + avail = OBJ_NEW(mca_coll_base_avail_coll_t); avail->ac_priority = priority; avail->ac_module = module; // Point to the string so we don't have to free later @@ -376,6 +445,27 @@ static opal_list_t *check_components(opal_list_t * components, /* Put this list in priority order */ opal_list_sort(selectable, avail_coll_compare); + /* For all valid component reorder them not on their provided priorities but on + * the order requested in the info key. As at this point the coll_include is + * already ordered backward we can simply prepend the components. 
+ */ + mca_coll_base_avail_coll_t *item, *item_next; + OPAL_LIST_FOREACH_SAFE(item, item_next, + selectable, mca_coll_base_avail_coll_t) { + if( component_in_argv(coll_include, item->ac_component_name) ) { + opal_list_remove_item(selectable, &item->super); + opal_list_prepend(selectable, &item->super); + } + } + + opal_argv_free(coll_argv); + if( NULL != coll_exclude ) { + free(coll_exclude); + } + if( NULL != coll_include ) { + free(coll_include); + } + /* All done */ return selectable; } @@ -409,7 +499,6 @@ static int check_one_component(ompi_communicator_t * comm, return priority; } - /************************************************************************** * Query functions **************************************************************************/ diff --git a/ompi/mca/coll/base/coll_base_comm_unselect.c b/ompi/mca/coll/base/coll_base_comm_unselect.c index fea0a53ec7..0e0f1bb5bf 100644 --- a/ompi/mca/coll/base/coll_base_comm_unselect.c +++ b/ompi/mca/coll/base/coll_base_comm_unselect.c @@ -16,6 +16,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. * Copyright (c) 2017 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 BULL S.A.S. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -34,6 +35,7 @@ #include "ompi/communicator/communicator.h" #include "ompi/mca/coll/base/base.h" +#include "ompi/mca/coll/base/coll_base_util.h" #define CLOSE(comm, func) \ do { \ @@ -50,6 +52,8 @@ int mca_coll_base_comm_unselect(ompi_communicator_t * comm) { + opal_list_item_t *item; + CLOSE(comm, allgather); CLOSE(comm, allgatherv); CLOSE(comm, allreduce); @@ -124,6 +128,17 @@ int mca_coll_base_comm_unselect(ompi_communicator_t * comm) CLOSE(comm, reduce_local); + for (item = opal_list_remove_first(comm->c_coll->module_list); + NULL != item; item = opal_list_remove_first(comm->c_coll->module_list)) { + mca_coll_base_avail_coll_t *avail = (mca_coll_base_avail_coll_t *) item; + + if(avail->ac_module) { + OBJ_RELEASE(avail->ac_module); + } + OBJ_RELEASE(avail); + } + OBJ_RELEASE(comm->c_coll->module_list); + free(comm->c_coll); comm->c_coll = NULL; diff --git a/ompi/mca/coll/base/coll_base_util.c b/ompi/mca/coll/base/coll_base_util.c index 91dd677dbc..99c8b516a2 100644 --- a/ompi/mca/coll/base/coll_base_util.c +++ b/ompi/mca/coll/base/coll_base_util.c @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2016 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -29,6 +29,8 @@ #include "ompi/mca/topo/base/base.h" #include "ompi/mca/pml/pml.h" #include "coll_base_util.h" +#include "coll_base_functions.h" +#include int ompi_coll_base_sendrecv_actual( const void* sendbuf, size_t scount, ompi_datatype_t* sdatatype, @@ -268,7 +270,7 @@ int ompi_coll_base_retain_datatypes_w( ompi_request_t *req, } else { scount = rcount = OMPI_COMM_IS_INTER(comm)?ompi_comm_remote_size(comm):ompi_comm_size(comm); } - + for (int i=0; icb.req_complete_cb = NULL; req->req_complete_cb_data = NULL; req->data.objs.objs[0] = NULL; @@ -305,3 +308,253 @@ static void nbc_req_cons(ompi_coll_base_nbc_request_t *req) { } OBJ_CLASS_INSTANCE(ompi_coll_base_nbc_request_t, ompi_request_t, nbc_req_cons, NULL); + +/* File reading functions */ +static void skiptonewline (FILE *fptr, int *fileline) +{ + char val; + int rc; + + do { + rc = fread(&val, 1, 1, fptr); + if (0 == rc) { + return; + } + if ('\n' == val) { + (*fileline)++; + return; + } + } while (1); +} + +int ompi_coll_base_file_getnext_long(FILE *fptr, int *fileline, long* val) +{ + char trash; + int rc; + + do { + rc = fscanf(fptr, "%li", val); + if (rc == EOF) { + return -1; + } + if (1 == rc) { + return 0; + } + /* in all other cases, skip to the end of the token */ + rc = fread(&trash, sizeof(char), 1, fptr); + if (rc == EOF) { + return -1; + } + if ('\n' == trash) (*fileline)++; + if ('#' == trash) { + skiptonewline (fptr, fileline); + } + } while (1); +} + +int ompi_coll_base_file_getnext_string(FILE *fptr, int *fileline, char** val) +{ + char trash, token[32]; + int rc; + + *val = NULL; /* security in case we fail */ + do { + rc = fscanf(fptr, "%32s", token); + if (rc == EOF) { + return -1; + } + if (1 == rc) { + if( '#' == token[0] ) { + skiptonewline(fptr, fileline); + continue; + } + *val = (char*)malloc(strlen(token) + 1); + strcpy(*val, token); + return 0; + } + /* in all other cases, skip to the end of the token */ + rc 
= fread(&trash, sizeof(char), 1, fptr); + if (rc == EOF) { + return -1; + } + if ('\n' == trash) (*fileline)++; + if ('#' == trash) { + skiptonewline (fptr, fileline); + } + } while (1); +} + +int ompi_coll_base_file_getnext_size_t(FILE *fptr, int *fileline, size_t* val) +{ + char trash; + int rc; + + do { + rc = fscanf(fptr, "%" PRIsize_t, val); + if (rc == EOF) { + return -1; + } + if (1 == rc) { + return 0; + } + /* in all other cases, skip to the end of the token */ + rc = fread(&trash, sizeof(char), 1, fptr); + if (rc == EOF) { + return -1; + } + if ('\n' == trash) (*fileline)++; + if ('#' == trash) { + skiptonewline (fptr, fileline); + } + } while (1); +} + +int ompi_coll_base_file_peek_next_char_is(FILE *fptr, int *fileline, int expected) +{ + char trash; + int rc; + + do { + rc = fread(&trash, sizeof(char), 1, fptr); + if (0 == rc) { /* hit the end of the file */ + return -1; + } + if ('\n' == trash) { + (*fileline)++; + continue; + } + if ('#' == trash) { + skiptonewline (fptr, fileline); + continue; + } + if( trash == expected ) + return 1; /* return true and eat the char */ + if( isblank(trash) ) /* skip all spaces if that's not what we were looking for */ + continue; + if( 0 != fseek(fptr, -1, SEEK_CUR) ) + return -1; + return 0; + } while (1); +} + +/** + * There are certainly simpler implementation for this function when performance + * is not a critical point. But, as this function is used during the collective + * configuration, and we can do this configurations once for each communicator, + * I would rather have a more complex but faster implementation. + * The approach here is to search for the largest common denominators, to create + * something similar to a dichotomic search. 
+ */ +int mca_coll_base_name_to_colltype(const char* name) +{ + if( 'n' == name[0] ) { + if( 0 == strncmp(name, "neighbor_all", 12) ) { + if( 't' != name[12] ) { + if( 0 == strncmp(name+12, "gather", 6) ) { + if('\0' == name[18]) return NEIGHBOR_ALLGATHER; + if( 'v' == name[18]) return NEIGHBOR_ALLGATHERV; + } + } else { + if( 0 == strncmp(name+12, "toall", 5) ) { + if( '\0' == name[17] ) return NEIGHBOR_ALLTOALL; + if( 'v' == name[17] ) return NEIGHBOR_ALLTOALLV; + if( 'w' == name[17] ) return NEIGHBOR_ALLTOALLW; + } + } + } + return -1; + } + if( 'a' == name[0] ) { + if( 0 != strncmp(name, "all", 3) ) { + return -1; + } + if( 't' != name[3] ) { + if( 'r' == name[3] ) { + if( 0 == strcmp(name+3, "reduce") ) + return ALLREDUCE; + } else { + if( 0 == strncmp(name+3, "gather", 6) ) { + if( '\0' == name[9] ) return ALLGATHER; + if( 'v' == name[9] ) return ALLGATHERV; + } + } + } else { + if( 0 == strncmp(name+3, "toall", 5) ) { + if( '\0' == name[8] ) return ALLTOALL; + if( 'v' == name[8] ) return ALLTOALLV; + if( 'w' == name[8] ) return ALLTOALLW; + } + } + return -1; + } + if( 'r' > name[0] ) { + if( 'b' == name[0] ) { + if( 0 == strcmp(name, "barrier") ) + return BARRIER; + if( 0 == strcmp(name, "bcast") ) + return BCAST; + } else if( 'g'== name[0] ) { + if( 0 == strncmp(name, "gather", 6) ) { + if( '\0' == name[6] ) return GATHER; + if( 'v' == name[6] ) return GATHERV; + } + } + if( 0 == strcmp(name, "exscan") ) + return EXSCAN; + return -1; + } + if( 's' > name[0] ) { + if( 0 == strncmp(name, "reduce", 6) ) { + if( '\0' == name[6] ) return REDUCE; + if( '_' == name[6] ) { + if( 0 == strncmp(name+7, "scatter", 7) ) { + if( '\0' == name[14] ) return REDUCESCATTER; + if( 0 == strcmp(name+14, "_block") ) return REDUCESCATTERBLOCK; + } + } + } + return -1; + } + if( 0 == strcmp(name, "scan") ) + return SCAN; + if( 0 == strcmp(name, "scatterv") ) + return SCATTERV; + if( 0 == strcmp(name, "scatter") ) + return SCATTER; + return -1; +} + +/* conversion table for all 
COLLTYPE_T values defined in ompi/mca/coll/base/coll_base_functions.h */ +static const char* colltype_translation_table[] = { + [ALLGATHER] = "allgather", + [ALLGATHERV] = "allgatherv", + [ALLREDUCE] = "allreduce", + [ALLTOALL] = "alltoall", + [ALLTOALLV] = "alltoallv", + [ALLTOALLW] = "alltoallw", + [BARRIER] = "barrier", + [BCAST] = "bcast", + [EXSCAN] = "exscan", + [GATHER] = "gather", + [GATHERV] = "gatherv", + [REDUCE] = "reduce", + [REDUCESCATTER] = "reduce_scatter", + [REDUCESCATTERBLOCK] = "reduce_scatter_block", + [SCAN] = "scan", + [SCATTER] = "scatter", + [SCATTERV] = "scatterv", + [NEIGHBOR_ALLGATHER] = "neighbor_allgather", + [NEIGHBOR_ALLGATHERV] = "neighbor_allgatherv", + [NEIGHBOR_ALLTOALL] = "neighbor_alltoall", + [NEIGHBOR_ALLTOALLV] = "neighbor_alltoallv", + [NEIGHBOR_ALLTOALLW] = "neighbor_alltoallw", + [COLLCOUNT] = NULL +}; + +char* mca_coll_base_colltype_to_str(int collid) +{ + if( (collid < 0) || (collid >= COLLCOUNT) ) { + return NULL; + } + return strdup(colltype_translation_table[collid]); +} diff --git a/ompi/mca/coll/base/coll_base_util.h b/ompi/mca/coll/base/coll_base_util.h index 05eaa41953..ee649fa63f 100644 --- a/ompi/mca/coll/base/coll_base_util.h +++ b/ompi/mca/coll/base/coll_base_util.h @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2015 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2007 High Performance Computing Center Stuttgart, @@ -84,6 +84,19 @@ ompi_coll_base_nbc_reserve_tags(ompi_communicator_t* comm, int32_t reserve) typedef struct ompi_coll_base_nbc_request_t ompi_coll_base_nbc_request_t; +/* + * Structure to store an available module + */ +struct mca_coll_base_avail_coll_t { + opal_list_item_t super; + + int ac_priority; + mca_coll_base_module_t *ac_module; + const char * ac_component_name; +}; +typedef struct mca_coll_base_avail_coll_t mca_coll_base_avail_coll_t; +OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_coll_base_avail_coll_t); + /** * A MPI_like function doing a send and a receive simultaneously. * If one of the communications results in a zero-byte message the @@ -164,5 +177,18 @@ int ompi_coll_base_retain_datatypes_w( ompi_request_t *request, ompi_datatype_t * const stypes[], ompi_datatype_t * const rtypes[]); +/* File reading functions */ +int ompi_coll_base_file_getnext_long(FILE *fptr, int *fileline, long* val); +int ompi_coll_base_file_getnext_size_t(FILE *fptr, int *fileline, size_t* val); +int ompi_coll_base_file_getnext_string(FILE *fptr, int *fileline, char** val); +/* peek at the next valid token to see if it begins with the expected value. If yes + * eat the value, otherwise put it back into the file. + */ +int ompi_coll_base_file_peek_next_char_is(FILE *fptr, int *fileline, int expected); + +/* Miscellaneous functions */ +char* mca_coll_base_colltype_to_str(int collid); +int mca_coll_base_name_to_colltype(const char* name); + END_C_DECLS #endif /* MCA_COLL_BASE_UTIL_EXPORT_H */ diff --git a/ompi/mca/coll/coll.h b/ompi/mca/coll/coll.h index f852f26732..57e4af4ac0 100644 --- a/ompi/mca/coll/coll.h +++ b/ompi/mca/coll/coll.h @@ -19,6 +19,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. * Copyright (c) 2017 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 BULL S.A.S. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -767,6 +768,9 @@ struct mca_coll_base_comm_coll_t { mca_coll_base_module_reduce_local_fn_t coll_reduce_local; mca_coll_base_module_2_3_0_t *coll_reduce_local_module; + + /* List of modules initialized, queried and enabled */ + opal_list_t *module_list; }; typedef struct mca_coll_base_comm_coll_t mca_coll_base_comm_coll_t; diff --git a/ompi/mca/coll/han/Makefile.am b/ompi/mca/coll/han/Makefile.am new file mode 100644 index 0000000000..61b40d97c5 --- /dev/null +++ b/ompi/mca/coll/han/Makefile.am @@ -0,0 +1,54 @@ +# +# Copyright (c) 2018-2020 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +sources = \ +coll_han.h \ +coll_han_trigger.h \ +coll_han_dynamic.h \ +coll_han_dynamic_file.h \ +coll_han_bcast.c \ +coll_han_reduce.c \ +coll_han_scatter.c \ +coll_han_gather.c \ +coll_han_allreduce.c \ +coll_han_allgather.c \ +coll_han_component.c \ +coll_han_module.c \ +coll_han_trigger.c \ +coll_han_dynamic.c \ +coll_han_dynamic_file.c \ +coll_han_topo.c \ +coll_han_subcomms.c + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). + +component_noinst = +component_install = +if MCA_BUILD_ompi_coll_han_DSO +component_install += mca_coll_han.la +else +component_noinst += libmca_coll_han.la +endif + +# See ompi/mca/btl/sm/Makefile.am for an explanation of +# libmca_common_sm.la. 
+ +mcacomponentdir = $(ompilibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_coll_han_la_SOURCES = $(sources) +mca_coll_han_la_LDFLAGS = -module -avoid-version +mca_coll_han_la_LIBADD = + +noinst_LTLIBRARIES = $(component_noinst) +libmca_coll_han_la_SOURCES =$(sources) +libmca_coll_han_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/coll/han/coll_han.h b/ompi/mca/coll/han/coll_han.h new file mode 100644 index 0000000000..16efcbe8e5 --- /dev/null +++ b/ompi/mca/coll/han/coll_han.h @@ -0,0 +1,539 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_COLL_HAN_EXPORT_H +#define MCA_COLL_HAN_EXPORT_H + +#include "ompi_config.h" + +#include "mpi.h" +#include "ompi/mca/mca.h" +#include "opal/util/output.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "coll_han_trigger.h" +#include "ompi/mca/coll/han/coll_han_dynamic.h" + +/* + * Today; + * . only 2 modules available for intranode (low) level + * . 
only 2 modules available for internode (up) level + */ + +#define COLL_HAN_LOW_MODULES 2 +#define COLL_HAN_UP_MODULES 2 + +struct mca_coll_han_bcast_args_s { + mca_coll_task_t *cur_task; + ompi_communicator_t *up_comm; + ompi_communicator_t *low_comm; + void *buff; + ompi_datatype_t *dtype; + int seg_count; + int root_low_rank; + int root_up_rank; + int num_segments; + int cur_seg; + int w_rank; + int last_seg_count; + bool noop; +}; +typedef struct mca_coll_han_bcast_args_s mca_coll_han_bcast_args_t; + +struct mca_coll_han_reduce_args_s { + mca_coll_task_t *cur_task; + ompi_communicator_t *up_comm; + ompi_communicator_t *low_comm; + void *sbuf; + void *rbuf; + ompi_op_t *op; + ompi_datatype_t *dtype; + int seg_count; + int root_low_rank; + int root_up_rank; + int num_segments; + int cur_seg; + int w_rank; + int last_seg_count; + bool noop; + bool is_tmp_rbuf; +}; +typedef struct mca_coll_han_reduce_args_s mca_coll_han_reduce_args_t; + +struct mca_coll_han_allreduce_args_s { + mca_coll_task_t *cur_task; + ompi_communicator_t *up_comm; + ompi_communicator_t *low_comm; + ompi_request_t *req; + void *sbuf; + void *rbuf; + ompi_op_t *op; + ompi_datatype_t *dtype; + int seg_count; + int root_up_rank; + int root_low_rank; + int num_segments; + int cur_seg; + int w_rank; + int last_seg_count; + bool noop; + int *completed; +}; +typedef struct mca_coll_han_allreduce_args_s mca_coll_han_allreduce_args_t; + +struct mca_coll_han_scatter_args_s { + mca_coll_task_t *cur_task; + ompi_communicator_t *up_comm; + ompi_communicator_t *low_comm; + ompi_request_t *req; + void *sbuf; + void *sbuf_inter_free; + void *sbuf_reorder_free; + void *rbuf; + ompi_datatype_t *sdtype; + ompi_datatype_t *rdtype; + int scount; + int rcount; + int root; + int root_up_rank; + int root_low_rank; + int w_rank; + bool noop; +}; +typedef struct mca_coll_han_scatter_args_s mca_coll_han_scatter_args_t; + +struct mca_coll_han_gather_args_s { + mca_coll_task_t *cur_task; + ompi_communicator_t *up_comm; + 
ompi_communicator_t *low_comm; + ompi_request_t *req; + void *sbuf; + void *sbuf_inter_free; + void *rbuf; + ompi_datatype_t *sdtype; + ompi_datatype_t *rdtype; + int scount; + int rcount; + int root; + int root_up_rank; + int root_low_rank; + int w_rank; + bool noop; + bool is_mapbycore; +}; +typedef struct mca_coll_han_gather_args_s mca_coll_han_gather_args_t; + +struct mca_coll_han_allgather_s { + mca_coll_task_t *cur_task; + ompi_communicator_t *up_comm; + ompi_communicator_t *low_comm; + ompi_request_t *req; + void *sbuf; + void *sbuf_inter_free; + void *rbuf; + ompi_datatype_t *sdtype; + ompi_datatype_t *rdtype; + int scount; + int rcount; + int root_low_rank; + int w_rank; + bool noop; + bool is_mapbycore; + int *topo; +}; +typedef struct mca_coll_han_allgather_s mca_coll_han_allgather_t; + +/** + * Structure to hold the han coll component. First it holds the + * base coll component, and then holds a bunch of + * han-coll-component-specific stuff (e.g., current MCA param + * values). 
+ */ +typedef struct mca_coll_han_component_t { + /** Base coll component */ + mca_coll_base_component_2_0_0_t super; + + /** MCA parameter: Priority of this component */ + int han_priority; + /* whether output the log message */ + int han_output; + /* segment size for bcast */ + uint32_t han_bcast_segsize; + /* up level module for bcast */ + uint32_t han_bcast_up_module; + /* low level module for bcast */ + uint32_t han_bcast_low_module; + /* segment size for reduce */ + uint32_t han_reduce_segsize; + /* up level module for reduce */ + uint32_t han_reduce_up_module; + /* low level module for reduce */ + uint32_t han_reduce_low_module; + /* segment size for allreduce */ + uint32_t han_allreduce_segsize; + /* up level module for allreduce */ + uint32_t han_allreduce_up_module; + /* low level module for allreduce */ + uint32_t han_allreduce_low_module; + /* up level module for allgather */ + uint32_t han_allgather_up_module; + /* low level module for allgather */ + uint32_t han_allgather_low_module; + /* up level module for gather */ + uint32_t han_gather_up_module; + /* low level module for gather */ + uint32_t han_gather_low_module; + /* up level module for scatter */ + uint32_t han_scatter_up_module; + /* low level module for scatter */ + uint32_t han_scatter_low_module; + /* whether we need reproducible results + * (but disables topological optimisations) + */ + uint32_t han_reproducible; + bool use_simple_algorithm[COLLCOUNT]; + + /* Dynamic configuration rules */ + bool use_dynamic_file_rules; + bool dump_dynamic_rules; + char* dynamic_rules_filename; + /* Dynamic rules from file */ + mca_coll_han_dynamic_rules_t dynamic_rules; + /* Dynamic rules from mca parameter */ + COMPONENT_T mca_rules[COLLCOUNT][NB_TOPO_LVL]; + + /* Define maximum dynamic errors printed by rank 0 with a 0 verbosity level */ + int max_dynamic_errors; +} mca_coll_han_component_t; + +typedef void (*previous_dummy_fn_t) (void); + +/* + * Structure used to store what is necessary for the 
collective operations + * routines in case of fallback. + */ +typedef struct mca_coll_han_single_collective_fallback_s { + union { + mca_coll_base_module_allgather_fn_t allgather; + mca_coll_base_module_allgatherv_fn_t allgatherv; + mca_coll_base_module_allreduce_fn_t allreduce; + mca_coll_base_module_bcast_fn_t bcast; + mca_coll_base_module_gather_fn_t gather; + mca_coll_base_module_reduce_fn_t reduce; + mca_coll_base_module_scatter_fn_t scatter; + previous_dummy_fn_t dummy; + }; + mca_coll_base_module_t* module; +} mca_coll_han_single_collective_fallback_t; + +/* + * The structure containing a replacement for all collective supported + * by HAN. This structure is used as a fallback during subcommunicator + * creation. + */ +typedef struct mca_coll_han_collectives_fallback_s { + mca_coll_han_single_collective_fallback_t allgather; + mca_coll_han_single_collective_fallback_t allgatherv; + mca_coll_han_single_collective_fallback_t allreduce; + mca_coll_han_single_collective_fallback_t bcast; + mca_coll_han_single_collective_fallback_t reduce; + mca_coll_han_single_collective_fallback_t gather; + mca_coll_han_single_collective_fallback_t scatter; +} mca_coll_han_collectives_fallback_t; + +/** Coll han module */ +typedef struct mca_coll_han_module_t { + /** Base module */ + mca_coll_base_module_t super; + + /* Whether this module has been lazily initialized or not yet */ + bool enabled; + + struct ompi_communicator_t **cached_low_comms; + struct ompi_communicator_t **cached_up_comms; + int *cached_vranks; + int *cached_topo; + bool is_mapbycore; + bool are_ppn_imbalanced; + + /* To be able to fallback when the cases are not supported */ + struct mca_coll_han_collectives_fallback_s fallback; + + /* To be able to fallback on reproducible algorithm */ + mca_coll_base_module_reduce_fn_t reproducible_reduce; + mca_coll_base_module_t *reproducible_reduce_module; + mca_coll_base_module_allreduce_fn_t reproducible_allreduce; + mca_coll_base_module_t 
*reproducible_allreduce_module; + + /* Topological level of this communicator */ + TOPO_LVL_T topologic_level; + + /* Collective module storage for module choice */ + mca_coll_han_collective_modules_storage_t modules_storage; + bool storage_initialized; + + /* + * Number of dynamic errors encountered + * The first mca_coll_han_component.max_dynamic_errors + * of rank 0 are printed with verbosity = 0 + */ + int dynamic_errors; + + /* Sub-communicator */ + struct ompi_communicator_t *sub_comm[NB_TOPO_LVL]; +} mca_coll_han_module_t; +OBJ_CLASS_DECLARATION(mca_coll_han_module_t); + +/* + * Some defines to stick to the naming used in the other components in terms of + * fallback routines + */ +#define previous_allgather fallback.allgather.allgather +#define previous_allgather_module fallback.allgather.module + +#define previous_allgatherv fallback.allgatherv.allgatherv +#define previous_allgatherv_module fallback.allgatherv.module + +#define previous_allreduce fallback.allreduce.allreduce +#define previous_allreduce_module fallback.allreduce.module + +#define previous_bcast fallback.bcast.bcast +#define previous_bcast_module fallback.bcast.module + +#define previous_reduce fallback.reduce.reduce +#define previous_reduce_module fallback.reduce.module + +#define previous_gather fallback.gather.gather +#define previous_gather_module fallback.gather.module + +#define previous_scatter fallback.scatter.scatter +#define previous_scatter_module fallback.scatter.module + + +/* macro to correctly load a fallback collective module */ +#define HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, COLL) \ + do { \ + if ( ((COMM)->c_coll->coll_ ## COLL ## _module) == (mca_coll_base_module_t*)(HANM) ) { \ + (COMM)->c_coll->coll_ ## COLL = (HANM)->fallback.COLL.COLL; \ + mca_coll_base_module_t *coll_module = (COMM)->c_coll->coll_ ## COLL ## _module; \ + (COMM)->c_coll->coll_ ## COLL ## _module = (HANM)->fallback.COLL.module; \ + OBJ_RETAIN((COMM)->c_coll->coll_ ## COLL ## _module); \ + 
OBJ_RELEASE(coll_module); \ + } \ + } while(0) + +/* macro to correctly load /all/ fallback collectives */ +#define HAN_LOAD_FALLBACK_COLLECTIVES(HANM, COMM) \ + do { \ + HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, bcast); \ + HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, scatter); \ + HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, gather); \ + HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, reduce); \ + HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, allreduce); \ + HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, allgather); \ + HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, allgatherv); \ + (HANM)->enabled = false; /* entire module set to pass-through from now on */ \ + } while(0) + + +/** + * Global component instance + */ +OMPI_MODULE_DECLSPEC extern mca_coll_han_component_t mca_coll_han_component; + +/* + * coll module functions + */ +int mca_coll_han_init_query(bool enable_progress_threads, bool enable_mpi_threads); + +mca_coll_base_module_t *mca_coll_han_comm_query(struct ompi_communicator_t *comm, int *priority); + +int han_request_free(ompi_request_t ** request); + +/* Subcommunicator creation */ +int mca_coll_han_comm_create(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module); +int mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, mca_coll_han_module_t *han_module); + +/** + * Gather topology information + * + * Returns a pointer to the (potentially already cached) topology. + * NOTE: if the rank distribution is imbalanced, no effort will be made to gather + * the topology at all ranks and instead NULL is returned and han_module->is_mapbycore + * is set to false. + * If HAN ever learns to deal with imbalanced topologies, this needs fixing! 
+ */ +int *mca_coll_han_topo_init(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module, + int num_topo_level); + +/* Utils */ +static inline void +mca_coll_han_get_ranks(int *vranks, int root, int low_size, + int *root_low_rank, int *root_up_rank) +{ + *root_up_rank = vranks[root] / low_size; + *root_low_rank = vranks[root] % low_size; +} + +const char* mca_coll_han_topo_lvl_to_str(TOPO_LVL_T topo_lvl); + +/** Dynamic component choice */ +/* + * Get all the collective modules initialized on this communicator + * This function must be call at the start of every selector implementation + */ +int +mca_coll_han_get_all_coll_modules(struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module); + +int +mca_coll_han_allgather_intra_dynamic(ALLGATHER_BASE_ARGS, + mca_coll_base_module_t *module); +int +mca_coll_han_allgatherv_intra_dynamic(ALLGATHERV_BASE_ARGS, + mca_coll_base_module_t *module); +int +mca_coll_han_allreduce_intra_dynamic(ALLREDUCE_BASE_ARGS, + mca_coll_base_module_t *module); +int +mca_coll_han_bcast_intra_dynamic(BCAST_BASE_ARGS, + mca_coll_base_module_t *module); +int +mca_coll_han_gather_intra_dynamic(GATHER_BASE_ARGS, + mca_coll_base_module_t *module); +int +mca_coll_han_reduce_intra_dynamic(REDUCE_BASE_ARGS, + mca_coll_base_module_t *module); +int +mca_coll_han_scatter_intra_dynamic(SCATTER_BASE_ARGS, + mca_coll_base_module_t *module); + +/* Bcast */ +int mca_coll_han_bcast_intra_simple(void *buff, + int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); +int mca_coll_han_bcast_intra(void *buff, int count, struct ompi_datatype_t *dtype, int root, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module); + +/* Reduce */ +int +mca_coll_han_reduce_intra_simple(const void *sbuf, + void* rbuf, + int count, + struct ompi_datatype_t *dtype, + ompi_op_t *op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); +int 
+mca_coll_han_reduce_reproducible_decision(struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); +int +mca_coll_han_reduce_reproducible(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); + +int mca_coll_han_reduce_intra(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + ompi_op_t* op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module); + +/* Allreduce */ +int +mca_coll_han_allreduce_intra_simple(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); +int +mca_coll_han_allreduce_reproducible_decision(struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); +int +mca_coll_han_allreduce_reproducible(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); + +int mca_coll_han_allreduce_intra(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module); + +/* Scatter */ +int +mca_coll_han_scatter_intra(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module); + +/* Gather */ +int +mca_coll_han_gather_intra(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module); +int +mca_coll_han_gather_intra_simple(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + 
int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); +/* reordering after gather, for unordered ranks */ +void +ompi_coll_han_reorder_gather(const void *sbuf, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + int * topo); + + + +/* Allgather */ +int +mca_coll_han_allgather_intra(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module); +int +mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); + +#endif /* MCA_COLL_HAN_EXPORT_H */ diff --git a/ompi/mca/coll/han/coll_han_allgather.c b/ompi/mca/coll/han/coll_han_allgather.c new file mode 100644 index 0000000000..cc7dfaff26 --- /dev/null +++ b/ompi/mca/coll/han/coll_han_allgather.c @@ -0,0 +1,420 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "coll_han.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/mca/coll/base/coll_tags.h" +#include "ompi/mca/pml/pml.h" +#include "coll_han_trigger.h" + +static int mca_coll_han_allgather_lb_task(void *task_args); +static int mca_coll_han_allgather_lg_task(void *task_args); +static int mca_coll_han_allgather_uag_task(void *task_args); + +static inline void +mca_coll_han_set_allgather_args(mca_coll_han_allgather_t * args, + mca_coll_task_t * cur_task, + void *sbuf, + void *sbuf_inter_free, + int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, + int rcount, + struct ompi_datatype_t *rdtype, + int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int w_rank, + bool noop, + bool is_mapbycore, + int *topo, + ompi_request_t * req) +{ + args->cur_task = cur_task; + args->sbuf = sbuf; + args->sbuf_inter_free = sbuf_inter_free; + args->scount = scount; + args->sdtype = sdtype; + args->rbuf = rbuf; + args->rcount = rcount; + args->rdtype = rdtype; + args->root_low_rank = root_low_rank; + args->up_comm = up_comm; + args->low_comm = low_comm; + args->w_rank = w_rank; + args->noop = noop; + args->is_mapbycore = is_mapbycore; + args->topo = topo; + args->req = req; +} + +int +mca_coll_han_allgather_intra(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module) +{ + /* Create the subcommunicators */ + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; + if( OMPI_SUCCESS != mca_coll_han_comm_create_new(comm, han_module) ) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allgather within this communicator. 
Fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype, + comm, comm->c_coll->coll_allgather_module); + } + ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; + ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; + int low_rank = ompi_comm_rank(low_comm); + int w_rank = ompi_comm_rank(comm); + + /* Init topo */ + int *topo = mca_coll_han_topo_init(comm, han_module, 2); + /* unbalanced case needs algo adaptation */ + if (han_module->are_ppn_imbalanced) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allgather with this communicator (imbalance). Fall back on another component\n")); + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, allgather); + return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype, + comm, comm->c_coll->coll_allgather_module); + } + + ompi_request_t *temp_request = NULL; + /* Set up request */ + temp_request = OBJ_NEW(ompi_request_t); + temp_request->req_state = OMPI_REQUEST_ACTIVE; + temp_request->req_type = OMPI_REQUEST_COLL; + temp_request->req_free = han_request_free; + temp_request->req_status = (ompi_status_public_t){0}; + temp_request->req_complete = REQUEST_PENDING; + + int root_low_rank = 0; + /* Create lg (lower level gather) task */ + mca_coll_task_t *lg = OBJ_NEW(mca_coll_task_t); + /* Setup lg task arguments */ + mca_coll_han_allgather_t *lg_args = malloc(sizeof(mca_coll_han_allgather_t)); + mca_coll_han_set_allgather_args(lg_args, lg, (char *) sbuf, NULL, scount, sdtype, rbuf, rcount, + rdtype, root_low_rank, up_comm, low_comm, w_rank, + low_rank != root_low_rank, han_module->is_mapbycore, topo, + temp_request); + /* Init and issue lg task */ + init_task(lg, mca_coll_han_allgather_lg_task, (void *) (lg_args)); + issue_task(lg); + + 
ompi_request_wait(&temp_request, MPI_STATUS_IGNORE); + + return OMPI_SUCCESS; +} + +/* lg: lower level gather task */ +int mca_coll_han_allgather_lg_task(void *task_args) +{ + mca_coll_han_allgather_t *t = (mca_coll_han_allgather_t *) task_args; + char *tmp_buf = NULL, *tmp_rbuf = NULL; + char *tmp_send = NULL; + + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] HAN Allgather: lg\n", + t->w_rank)); + + /* If the process is one of the node leader */ + ptrdiff_t rlb, rext; + ompi_datatype_get_extent (t->rdtype, &rlb, &rext); + if (MPI_IN_PLACE == t->sbuf) { + t->sdtype = t->rdtype; + t->scount = t->rcount; + } + if (!t->noop) { + int low_size = ompi_comm_size(t->low_comm); + ptrdiff_t rsize, rgap = 0; + rsize = opal_datatype_span(&t->rdtype->super, (int64_t) t->rcount * low_size, &rgap); + tmp_buf = (char *) malloc(rsize); + tmp_rbuf = tmp_buf - rgap; + if (MPI_IN_PLACE == t->sbuf) { + tmp_send = ((char*)t->rbuf) + (ptrdiff_t)t->w_rank * (ptrdiff_t)t->rcount * rext; + ompi_datatype_copy_content_same_ddt(t->rdtype, t->rcount, tmp_rbuf, tmp_send); + } + } + /* Lower level (shared memory or intra-node) gather */ + if (MPI_IN_PLACE == t->sbuf) { + if (!t->noop) { + t->low_comm->c_coll->coll_gather(MPI_IN_PLACE, t->scount, t->sdtype, + tmp_rbuf, t->rcount, t->rdtype, t->root_low_rank, + t->low_comm, t->low_comm->c_coll->coll_gather_module); + } + else { + tmp_send = ((char*)t->rbuf) + (ptrdiff_t)t->w_rank * (ptrdiff_t)t->rcount * rext; + t->low_comm->c_coll->coll_gather(tmp_send, t->rcount, t->rdtype, + NULL, t->rcount, t->rdtype, t->root_low_rank, + t->low_comm, t->low_comm->c_coll->coll_gather_module); + } + } + else { + t->low_comm->c_coll->coll_gather((char *) t->sbuf, t->scount, t->sdtype, tmp_rbuf, t->rcount, + t->rdtype, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_gather_module); + } + + t->sbuf = tmp_rbuf; + t->sbuf_inter_free = tmp_buf; + + /* Create uag (upper level all-gather) task */ + mca_coll_task_t *uag = t->cur_task; + /* 
Init and issue uag task */ + init_task(uag, mca_coll_han_allgather_uag_task, (void *) t); + issue_task(uag); + + return OMPI_SUCCESS; +} + +/* uag: upper level (inter-node) all-gather task */ +int mca_coll_han_allgather_uag_task(void *task_args) +{ + mca_coll_han_allgather_t *t = (mca_coll_han_allgather_t *) task_args; + + if (t->noop) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] HAN Allgather: uag noop\n", t->w_rank)); + } else { + int low_size = ompi_comm_size(t->low_comm); + int up_size = ompi_comm_size(t->up_comm); + char *reorder_buf = NULL; + char *reorder_rbuf = NULL; + if (t->is_mapbycore) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: HAN Allgather is bycore: ", t->w_rank)); + reorder_rbuf = (char *) t->rbuf; + } else { + ptrdiff_t rsize, rgap = 0; + rsize = + opal_datatype_span(&t->rdtype->super, + (int64_t) t->rcount * low_size * up_size, + &rgap); + reorder_buf = (char *) malloc(rsize); + reorder_rbuf = reorder_buf - rgap; + } + + /* Inter node allgather */ + t->up_comm->c_coll->coll_allgather((char *) t->sbuf, t->scount * low_size, t->sdtype, + reorder_rbuf, t->rcount * low_size, t->rdtype, + t->up_comm, t->up_comm->c_coll->coll_allgather_module); + + if (t->sbuf_inter_free != NULL) { + free(t->sbuf_inter_free); + t->sbuf_inter_free = NULL; + } + + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] HAN Allgather: ug allgather finish\n", t->w_rank)); + + /* Reorder the node leader's rbuf, copy data from tmp_rbuf to rbuf */ + if (!t->is_mapbycore) { + int i, j; + ptrdiff_t rextent; + ompi_datatype_type_extent(t->rdtype, &rextent); + for (i = 0; i < up_size; i++) { + for (j = 0; j < low_size; j++) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: HAN Allgather copy from %d %d\n", t->w_rank, + (i * low_size + j) * 2 + 1, + t->topo[(i * low_size + j) * 2 + 1])); + ompi_datatype_copy_content_same_ddt(t->rdtype, + (ptrdiff_t) t->rcount, + (char *) t->rbuf + + rextent 
* + (ptrdiff_t) t->topo[(i * low_size + j) * 2 + + 1] * + (ptrdiff_t) t->rcount, + reorder_rbuf + rextent * (i * low_size + + j) * + (ptrdiff_t) t->rcount); + } + } + free(reorder_buf); + reorder_buf = NULL; + } + } + + + /* Create lb (low level broadcast) task */ + mca_coll_task_t *lb = t->cur_task; + /* Init and issue lb task */ + init_task(lb, mca_coll_han_allgather_lb_task, (void *) t); + issue_task(lb); + + return OMPI_SUCCESS; +} + +/* lb: low level broadcast task */ +int mca_coll_han_allgather_lb_task(void *task_args) +{ + mca_coll_han_allgather_t *t = (mca_coll_han_allgather_t *) task_args; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] HAN Allgather: uag noop\n", + t->w_rank)); + OBJ_RELEASE(t->cur_task); + int low_size = ompi_comm_size(t->low_comm); + int up_size = ompi_comm_size(t->up_comm); + t->low_comm->c_coll->coll_bcast((char *) t->rbuf, t->rcount * low_size * up_size, t->rdtype, + t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_bcast_module); + + ompi_request_t *temp_req = t->req; + free(t); + ompi_request_complete(temp_req, 1); + return OMPI_SUCCESS; + +} + +int +mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module){ + + /* create the subcommunicators */ + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + + if( OMPI_SUCCESS != mca_coll_han_comm_create_new(comm, han_module) ) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allgather within this communicator. 
Fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype, + comm, comm->c_coll->coll_allgather_module); + } + /* discovery topology */ + int *topo = mca_coll_han_topo_init(comm, han_module, 2); + + /* unbalanced case needs algo adaptation */ + if (han_module->are_ppn_imbalanced) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allgather within this communicator (imbalance). Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. + */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, allgather); + return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype, + comm, comm->c_coll->coll_allgather_module); + } + + ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; + ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; + int w_rank = ompi_comm_rank(comm); + /* setup up/low coordinates */ + int low_rank = ompi_comm_rank(low_comm); + int low_size = ompi_comm_size(low_comm); + int up_rank = ompi_comm_rank(up_comm); + int up_size = ompi_comm_size(up_comm); + int root_low_rank = 0; // node leader will be 0 on each rank + + /* allocate the intermediary buffer + * to gather on leaders on the low sub communicator */ + ptrdiff_t rlb, rext; + ompi_datatype_get_extent (rdtype, &rlb, &rext); + char *tmp_buf = NULL; + char *tmp_buf_start = NULL; + char *tmp_send = NULL; + if (MPI_IN_PLACE == sbuf) { + scount = rcount; + sdtype = rdtype; + } + if (low_rank == root_low_rank) { + ptrdiff_t rsize, rgap = 0; + /* Compute the size to receive all the local data, including datatypes empty gaps */ + rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * low_size, &rgap); + /* intermediary buffer on node leaders to 
gather on low comm */ + tmp_buf = (char *) malloc(rsize); + tmp_buf_start = tmp_buf - rgap; + if (MPI_IN_PLACE == sbuf) { + tmp_send = ((char*)rbuf) + (ptrdiff_t)w_rank * (ptrdiff_t)rcount * rext; + ompi_datatype_copy_content_same_ddt(rdtype, rcount, tmp_buf_start, tmp_send); + } + } + + /* 1. low gather on node leaders into tmp_buf */ + if (MPI_IN_PLACE == sbuf) { + if (low_rank == root_low_rank) { + low_comm->c_coll->coll_gather(MPI_IN_PLACE, scount, sdtype, + tmp_buf_start, rcount, rdtype, root_low_rank, + low_comm, low_comm->c_coll->coll_gather_module); + } + else { + tmp_send = ((char*)rbuf) + (ptrdiff_t)w_rank * (ptrdiff_t)rcount * rext; + low_comm->c_coll->coll_gather(tmp_send, rcount, rdtype, + NULL, rcount, rdtype, root_low_rank, + low_comm, low_comm->c_coll->coll_gather_module); + } + } + else { + low_comm->c_coll->coll_gather((char *)sbuf, scount, sdtype, + tmp_buf_start, rcount, rdtype, root_low_rank, + low_comm, low_comm->c_coll->coll_gather_module); + } + /* 2. allgather between node leaders, from tmp_buf to reorder_buf */ + if (low_rank == root_low_rank) { + /* allocate buffer to store unordered result on node leaders + * if the processes are mapped-by core, no need to reorder: + * distribution of ranks on core first and node next, + * in a increasing order for both patterns. + */ + char *reorder_buf = NULL; + char *reorder_buf_start = NULL; + if (han_module->is_mapbycore) { + reorder_buf_start = rbuf; + } else { + if (0 == low_rank && 0 == up_rank) { // first rank displays message + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: Future Allgather needs reordering: ", up_rank)); + } + ptrdiff_t rsize, rgap = 0; + rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * low_size * up_size, &rgap); + reorder_buf = (char *) malloc(rsize); + reorder_buf_start = reorder_buf - rgap; + } + + /* 2a. 
inter node allgather */ + up_comm->c_coll->coll_allgather(tmp_buf_start, scount*low_size, sdtype, + reorder_buf_start, rcount*low_size, rdtype, + up_comm, up_comm->c_coll->coll_allgather_module); + + if (tmp_buf != NULL) { + free(tmp_buf); + tmp_buf = NULL; + tmp_buf_start = NULL; + } + + /* 2b. reorder the node leader's into rbuf. + * if ranks are not mapped in topological order, data needs to be reordered + * (see reorder_gather) + */ + if (!han_module->is_mapbycore) { + ompi_coll_han_reorder_gather(reorder_buf_start, + rbuf, rcount, rdtype, + comm, topo); + free(reorder_buf); + reorder_buf = NULL; + } + + } + + /* 3. up broadcast: leaders broadcast on their nodes */ + low_comm->c_coll->coll_bcast(rbuf, rcount*low_size*up_size, rdtype, + root_low_rank, low_comm, + low_comm->c_coll->coll_bcast_module); + + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/han/coll_han_allreduce.c b/ompi/mca/coll/han/coll_han_allreduce.c new file mode 100644 index 0000000000..afa0e0a220 --- /dev/null +++ b/ompi/mca/coll/han/coll_han_allreduce.c @@ -0,0 +1,558 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "coll_han.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/mca/coll/base/coll_tags.h" +#include "ompi/mca/pml/pml.h" +#include "coll_han_trigger.h" + +static int mca_coll_han_allreduce_t0_task(void *task_args); +static int mca_coll_han_allreduce_t1_task(void *task_args); +static int mca_coll_han_allreduce_t2_task(void *task_args); +static int mca_coll_han_allreduce_t3_task(void *task_args); + +/* Only work with regular situation (each node has equal number of processes) */ + +static inline void +mca_coll_han_set_allreduce_args(mca_coll_han_allreduce_args_t * args, + mca_coll_task_t * cur_task, + void *sbuf, + void *rbuf, + int seg_count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root_up_rank, + int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int num_segments, + int cur_seg, + int w_rank, + int last_seg_count, + bool noop, ompi_request_t * req, int *completed) +{ + args->cur_task = cur_task; + args->sbuf = sbuf; + args->rbuf = rbuf; + args->seg_count = seg_count; + args->dtype = dtype; + args->op = op; + args->root_up_rank = root_up_rank; + args->root_low_rank = root_low_rank; + args->up_comm = up_comm; + args->low_comm = low_comm; + args->num_segments = num_segments; + args->cur_seg = cur_seg; + args->w_rank = w_rank; + args->last_seg_count = last_seg_count; + args->noop = noop; + args->req = req; + args->completed = completed; +} + +/* + * Each segment of the messsage needs to go though 4 steps to perform MPI_Allreduce: + * lr: lower level (shared-memory or intra-node) reduce, + * ur: upper level (inter-node) reduce, + * ub: upper level (inter-node) bcast, + * lb: lower level (shared-memory or intra-node) bcast. + * Hence, in each iteration, there is a combination of collective operations which is called a task. 
+ * | seg 0 | seg 1 | seg 2 | seg 3 | + * iter 0 | lr | | | | task: t0, contains lr + * iter 1 | ur | lr | | | task: t1, contains ur and lr + * iter 2 | ub | ur | lr | | task: t2, contains ub, ur and lr + * iter 3 | lb | ub | ur | lr | task: t3, contains lb, ub, ur and lr + * iter 4 | | lb | ub | ur | task: t3, contains lb, ub and ur + * iter 5 | | | lb | ub | task: t3, contains lb and ub + * iter 6 | | | | lb | task: t3, contains lb + */ + +int +mca_coll_han_allreduce_intra(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module) +{ + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + + /* No support for non-commutative operations */ + if(!ompi_op_is_commute(op)) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allreduce with this operation. Fall back on another component\n")); + goto prev_allreduce_intra; + } + + /* Create the subcommunicators */ + if( OMPI_SUCCESS != mca_coll_han_comm_create(comm, han_module) ) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allreduce with this communicator. 
Drop HAN support in this communicator and fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_allreduce(sbuf, rbuf, count, dtype, op, + comm, comm->c_coll->coll_allreduce_module); + } + + ptrdiff_t extent, lb; + size_t dtype_size; + ompi_datatype_get_extent(dtype, &lb, &extent); + int seg_count = count, w_rank; + w_rank = ompi_comm_rank(comm); + ompi_datatype_type_size(dtype, &dtype_size); + + ompi_communicator_t *low_comm; + ompi_communicator_t *up_comm; + + /* use MCA parameters for now */ + low_comm = han_module->cached_low_comms[mca_coll_han_component.han_allreduce_low_module]; + up_comm = han_module->cached_up_comms[mca_coll_han_component.han_allreduce_up_module]; + COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_allreduce_segsize, dtype_size, + seg_count); + + /* Determine number of elements sent per task. */ + OPAL_OUTPUT_VERBOSE((10, mca_coll_han_component.han_output, + "In HAN Allreduce seg_size %d seg_count %d count %d\n", + mca_coll_han_component.han_allreduce_segsize, seg_count, count)); + int num_segments = (count + seg_count - 1) / seg_count; + + int low_rank = ompi_comm_rank(low_comm); + int root_up_rank = 0; + int root_low_rank = 0; + /* Create t0 task for the first segment */ + mca_coll_task_t *t0 = OBJ_NEW(mca_coll_task_t); + /* Setup up t0 task arguments */ + int *completed = (int *) malloc(sizeof(int)); + completed[0] = 0; + mca_coll_han_allreduce_args_t *t = malloc(sizeof(mca_coll_han_allreduce_args_t)); + mca_coll_han_set_allreduce_args(t, t0, (char *) sbuf, (char *) rbuf, seg_count, dtype, op, + root_up_rank, root_low_rank, up_comm, low_comm, num_segments, 0, + w_rank, count - (num_segments - 1) * seg_count, + low_rank != root_low_rank, NULL, completed); + /* Init t0 task */ + init_task(t0, mca_coll_han_allreduce_t0_task, (void *) (t)); + /* Issue t0 task */ + issue_task(t0); + + /* Create t1 tasks 
for the current segment */ + mca_coll_task_t *t1 = OBJ_NEW(mca_coll_task_t); + /* Setup up t1 task arguments */ + t->cur_task = t1; + /* Init t1 task */ + init_task(t1, mca_coll_han_allreduce_t1_task, (void *) t); + /* Issue t1 task */ + issue_task(t1); + + /* Create t2 tasks for the current segment */ + mca_coll_task_t *t2 = OBJ_NEW(mca_coll_task_t); + /* Setup up t2 task arguments */ + t->cur_task = t2; + /* Init t2 task */ + init_task(t2, mca_coll_han_allreduce_t2_task, (void *) t); + issue_task(t2); + + /* Create t3 tasks for the current segment */ + mca_coll_task_t *t3 = OBJ_NEW(mca_coll_task_t); + /* Setup up t3 task arguments */ + t->cur_task = t3; + /* Init t3 task */ + init_task(t3, mca_coll_han_allreduce_t3_task, (void *) t); + issue_task(t3); + + while (t->completed[0] != t->num_segments) { + /* Create t3 tasks for the current segment */ + mca_coll_task_t *t3 = OBJ_NEW(mca_coll_task_t); + /* Setup up t3 task arguments */ + t->cur_task = t3; + t->sbuf = (char *) t->sbuf + extent * t->seg_count; + t->rbuf = (char *) t->rbuf + extent * t->seg_count; + t->cur_seg = t->cur_seg + 1; + /* Init t3 task */ + init_task(t3, mca_coll_han_allreduce_t3_task, (void *) t); + issue_task(t3); + } + free(t->completed); + t->completed = NULL; + free(t); + + return OMPI_SUCCESS; + + prev_allreduce_intra: + return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op, + comm, han_module->previous_allreduce_module); +} + +/* t0 task */ +int mca_coll_han_allreduce_t0_task(void *task_args) +{ + mca_coll_han_allreduce_args_t *t = (mca_coll_han_allreduce_args_t *) task_args; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] HAN Allreduce: t0 %d r_buf %d\n", t->w_rank, t->cur_seg, + ((int *) t->rbuf)[0])); + OBJ_RELEASE(t->cur_task); + ptrdiff_t extent, lb; + ompi_datatype_get_extent(t->dtype, &lb, &extent); + if (MPI_IN_PLACE == t->sbuf) { + if (!t->noop) { + t->low_comm->c_coll->coll_reduce(MPI_IN_PLACE, (char *) t->rbuf, t->seg_count, t->dtype, + t->op, 
t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_reduce_module); + } + else { + t->low_comm->c_coll->coll_reduce((char *) t->rbuf, NULL, t->seg_count, t->dtype, + t->op, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_reduce_module); + } + } + else { + t->low_comm->c_coll->coll_reduce((char *) t->sbuf, (char *) t->rbuf, t->seg_count, t->dtype, + t->op, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_reduce_module); + } + return OMPI_SUCCESS; +} + +/* t1 task */ +int mca_coll_han_allreduce_t1_task(void *task_args) +{ + mca_coll_han_allreduce_args_t *t = (mca_coll_han_allreduce_args_t *) task_args; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] HAN Allreduce: t1 %d r_buf %d\n", t->w_rank, t->cur_seg, + ((int *) t->rbuf)[0])); + OBJ_RELEASE(t->cur_task); + ptrdiff_t extent, lb; + ompi_datatype_get_extent(t->dtype, &lb, &extent); + ompi_request_t *ireduce_req; + int tmp_count = t->seg_count; + if (!t->noop) { + int up_rank = ompi_comm_rank(t->up_comm); + /* ur of cur_seg */ + if (up_rank == t->root_up_rank) { + t->up_comm->c_coll->coll_ireduce(MPI_IN_PLACE, (char *) t->rbuf, t->seg_count, t->dtype, + t->op, t->root_up_rank, t->up_comm, &ireduce_req, + t->up_comm->c_coll->coll_ireduce_module); + } else { + t->up_comm->c_coll->coll_ireduce((char *) t->rbuf, (char *) t->rbuf, t->seg_count, + t->dtype, t->op, t->root_up_rank, t->up_comm, + &ireduce_req, t->up_comm->c_coll->coll_ireduce_module); + } + } + /* lr of cur_seg+1 */ + if (t->cur_seg <= t->num_segments - 2) { + if (t->cur_seg == t->num_segments - 2 && t->last_seg_count != t->seg_count) { + tmp_count = t->last_seg_count; + } + t->low_comm->c_coll->coll_reduce((char *) t->sbuf + extent * t->seg_count, + (char *) t->rbuf + extent * t->seg_count, tmp_count, + t->dtype, t->op, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_reduce_module); + + } + if (!t->noop) { + ompi_request_wait(&ireduce_req, MPI_STATUS_IGNORE); + } + + return OMPI_SUCCESS; +} + +/* 
t2 task */ +int mca_coll_han_allreduce_t2_task(void *task_args) +{ + mca_coll_han_allreduce_args_t *t = (mca_coll_han_allreduce_args_t *) task_args; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] HAN Allreduce: t2 %d r_buf %d\n", t->w_rank, t->cur_seg, + ((int *) t->rbuf)[0])); + OBJ_RELEASE(t->cur_task); + ptrdiff_t extent, lb; + ompi_datatype_get_extent(t->dtype, &lb, &extent); + ompi_request_t *reqs[2]; + int req_count = 0; + int tmp_count = t->seg_count; + if (!t->noop) { + int up_rank = ompi_comm_rank(t->up_comm); + /* ub of cur_seg */ + t->up_comm->c_coll->coll_ibcast((char *) t->rbuf, t->seg_count, t->dtype, t->root_up_rank, + t->up_comm, &(reqs[0]), + t->up_comm->c_coll->coll_ibcast_module); + req_count++; + /* ur of cur_seg+1 */ + if (t->cur_seg <= t->num_segments - 2) { + if (t->cur_seg == t->num_segments - 2 && t->last_seg_count != t->seg_count) { + tmp_count = t->last_seg_count; + } + if (up_rank == t->root_up_rank) { + t->up_comm->c_coll->coll_ireduce(MPI_IN_PLACE, + (char *) t->rbuf + extent * t->seg_count, + tmp_count, t->dtype, t->op, t->root_up_rank, + t->up_comm, &(reqs[1]), + t->up_comm->c_coll->coll_ireduce_module); + } else { + t->up_comm->c_coll->coll_ireduce((char *) t->rbuf + extent * t->seg_count, + (char *) t->rbuf + extent * t->seg_count, + tmp_count, t->dtype, t->op, t->root_up_rank, + t->up_comm, &(reqs[1]), + t->up_comm->c_coll->coll_ireduce_module); + } + req_count++; + } + } + /* lr of cur_seg+2 */ + if (t->cur_seg <= t->num_segments - 3) { + if (t->cur_seg == t->num_segments - 3 && t->last_seg_count != t->seg_count) { + tmp_count = t->last_seg_count; + } + t->low_comm->c_coll->coll_reduce((char *) t->sbuf + 2 * extent * t->seg_count, + (char *) t->rbuf + 2 * extent * t->seg_count, tmp_count, + t->dtype, t->op, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_reduce_module); + } + if (!t->noop && req_count > 0) { + ompi_request_wait_all(req_count, reqs, MPI_STATUSES_IGNORE); + } + + + return 
OMPI_SUCCESS; +} + +/* t3 task */ +int mca_coll_han_allreduce_t3_task(void *task_args) +{ + mca_coll_han_allreduce_args_t *t = (mca_coll_han_allreduce_args_t *) task_args; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] HAN Allreduce: t3 %d r_buf %d\n", t->w_rank, t->cur_seg, + ((int *) t->rbuf)[0])); + OBJ_RELEASE(t->cur_task); + ptrdiff_t extent, lb; + ompi_datatype_get_extent(t->dtype, &lb, &extent); + ompi_request_t *reqs[2]; + int req_count = 0; + int tmp_count = t->seg_count; + if (!t->noop) { + int up_rank = ompi_comm_rank(t->up_comm); + /* ub of cur_seg+1 */ + if (t->cur_seg <= t->num_segments - 2) { + if (t->cur_seg == t->num_segments - 2 && t->last_seg_count != t->seg_count) { + tmp_count = t->last_seg_count; + } + t->up_comm->c_coll->coll_ibcast((char *) t->rbuf + extent * t->seg_count, t->seg_count, + t->dtype, t->root_up_rank, t->up_comm, &(reqs[0]), + t->up_comm->c_coll->coll_ibcast_module); + req_count++; + } + /* ur of cur_seg+2 */ + if (t->cur_seg <= t->num_segments - 3) { + if (t->cur_seg == t->num_segments - 3 && t->last_seg_count != t->seg_count) { + tmp_count = t->last_seg_count; + } + if (up_rank == t->root_up_rank) { + t->up_comm->c_coll->coll_ireduce(MPI_IN_PLACE, + (char *) t->rbuf + 2 * extent * t->seg_count, + tmp_count, t->dtype, t->op, t->root_up_rank, + t->up_comm, &(reqs[1]), + t->up_comm->c_coll->coll_ireduce_module); + } else { + t->up_comm->c_coll->coll_ireduce((char *) t->rbuf + 2 * extent * t->seg_count, + (char *) t->rbuf + 2 * extent * t->seg_count, + tmp_count, t->dtype, t->op, t->root_up_rank, + t->up_comm, &(reqs[1]), + t->up_comm->c_coll->coll_ireduce_module); + } + req_count++; + } + } + /* lr of cur_seg+3 */ + if (t->cur_seg <= t->num_segments - 4) { + if (t->cur_seg == t->num_segments - 4 && t->last_seg_count != t->seg_count) { + tmp_count = t->last_seg_count; + } + t->low_comm->c_coll->coll_reduce((char *) t->sbuf + 3 * extent * t->seg_count, + (char *) t->rbuf + 3 * extent * t->seg_count, 
tmp_count, + t->dtype, t->op, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_reduce_module); + } + /* lb of cur_seg */ + t->low_comm->c_coll->coll_bcast((char *) t->rbuf, t->seg_count, t->dtype, t->root_low_rank, + t->low_comm, t->low_comm->c_coll->coll_bcast_module); + if (!t->noop && req_count > 0) { + ompi_request_wait_all(req_count, reqs, MPI_STATUSES_IGNORE); + } + + t->completed[0]++; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] HAN Allreduce: t3 %d total %d\n", t->w_rank, t->cur_seg, + t->completed[0])); + + return OMPI_SUCCESS; +} + +int +mca_coll_han_allreduce_intra_simple(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + ompi_communicator_t *low_comm; + ompi_communicator_t *up_comm; + int root_low_rank = 0; + int low_rank; + int ret; + mca_coll_han_component_t *cs = &mca_coll_han_component; + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + + OPAL_OUTPUT_VERBOSE((10, cs->han_output, + "[OMPI][han] in mca_coll_han_reduce_intra_simple\n")); + + // Fallback to another component if the op cannot commute + if (! ompi_op_is_commute(op)) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allreduce with this operation. Fall back on another component\n")); + goto prev_allreduce; + } + + /* Create the subcommunicators */ + if( OMPI_SUCCESS != mca_coll_han_comm_create_new(comm, han_module) ) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allreduce with this communicator. 
Drop HAN support in this communicator and fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_allreduce(sbuf, rbuf, count, dtype, op, + comm, comm->c_coll->coll_reduce_module); + } + + low_comm = han_module->sub_comm[INTRA_NODE]; + up_comm = han_module->sub_comm[INTER_NODE]; + low_rank = ompi_comm_rank(low_comm); + + /* Low_comm reduce */ + if (MPI_IN_PLACE == sbuf) { + if (low_rank == root_low_rank) { + ret = low_comm->c_coll->coll_reduce(MPI_IN_PLACE, (char *)rbuf, + count, dtype, op, root_low_rank, + low_comm, low_comm->c_coll->coll_reduce_module); + } + else { + ret = low_comm->c_coll->coll_reduce((char *)rbuf, NULL, + count, dtype, op, root_low_rank, + low_comm, low_comm->c_coll->coll_reduce_module); + } + } + else { + ret = low_comm->c_coll->coll_reduce((char *)sbuf, (char *)rbuf, + count, dtype, op, root_low_rank, + low_comm, low_comm->c_coll->coll_reduce_module); + } + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + OPAL_OUTPUT_VERBOSE((30, cs->han_output, + "HAN/ALLREDUCE: low comm reduce failed. " + "Falling back to another component\n")); + goto prev_allreduce; + } + + /* Local roots perform a allreduce on the upper comm */ + if (low_rank == root_low_rank) { + ret = up_comm->c_coll->coll_allreduce(MPI_IN_PLACE, rbuf, count, dtype, op, + up_comm, up_comm->c_coll->coll_allreduce_module); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + OPAL_OUTPUT_VERBOSE((30, cs->han_output, + "HAN/ALLREDUCE: up comm allreduce failed. \n")); + /* + * Do not fallback in such a case: only root_low_ranks follow this + * path, the other ranks are in another collective. + * ==> Falling back would potentially lead to a hang. 
+ * Simply return the error + */ + return ret; + } + } + + /* Low_comm bcast */ + ret = low_comm->c_coll->coll_bcast(rbuf, count, dtype, + root_low_rank, low_comm, low_comm->c_coll->coll_bcast_module); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + OPAL_OUTPUT_VERBOSE((30, cs->han_output, + "HAN/ALLREDUCE: low comm bcast failed. " + "Falling back to another component\n")); + goto prev_allreduce; + } + + return OMPI_SUCCESS; + + prev_allreduce: + return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op, + comm, han_module->previous_allreduce_module); +} + +/* Find a fallback on reproducible algorithm + * use tuned, or if impossible whatever available + */ +int +mca_coll_han_allreduce_reproducible_decision(struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + int w_rank = ompi_comm_rank(comm); + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + + /* populate previous modules_storage*/ + mca_coll_han_get_all_coll_modules(comm, han_module); + + /* try availability of reproducible modules*/ + int fallbacks[] = {TUNED, BASIC}; + int fallbacks_len = sizeof(fallbacks) / sizeof(*fallbacks); + int i; + for (i=0; i<fallbacks_len; i++) { + int fallback = fallbacks[i]; + mca_coll_base_module_t *fallback_module = han_module->modules_storage.modules[fallback].module_handler; + if (NULL != fallback_module && NULL != fallback_module->coll_allreduce) { + if (0 == w_rank) { + opal_output_verbose(30, mca_coll_han_component.han_output, + "coll:han:allreduce_reproducible: " + "fallback on %s\n", + available_components[fallback].component_name); + } + han_module->reproducible_allreduce_module = fallback_module; + han_module->reproducible_allreduce = fallback_module->coll_allreduce; + return OMPI_SUCCESS; + } + } + /* fallback of the fallback */ + if (0 == w_rank) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:allreduce_reproducible_decision: " + "no reproducible fallback\n"); + } + han_module->reproducible_allreduce_module = han_module->previous_allreduce_module; + han_module->reproducible_allreduce = 
han_module->previous_allreduce; + return OMPI_SUCCESS; +} + +/* Fallback on reproducible algorithm */ +int +mca_coll_han_allreduce_reproducible(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + return han_module->reproducible_allreduce(sbuf, rbuf, count, dtype, + op, comm, + han_module + ->reproducible_allreduce_module); +} diff --git a/ompi/mca/coll/han/coll_han_bcast.c b/ompi/mca/coll/han/coll_han_bcast.c new file mode 100644 index 0000000000..c32ea745b0 --- /dev/null +++ b/ompi/mca/coll/han/coll_han_bcast.c @@ -0,0 +1,281 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "coll_han.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/mca/coll/base/coll_tags.h" +#include "ompi/mca/pml/pml.h" +#include "coll_han_trigger.h" + +static int mca_coll_han_bcast_t0_task(void *task_args); +static int mca_coll_han_bcast_t1_task(void *task_args); + +static inline void +mca_coll_han_set_bcast_args(mca_coll_han_bcast_args_t * args, mca_coll_task_t * cur_task, void *buff, + int seg_count, struct ompi_datatype_t *dtype, + int root_up_rank, int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int num_segments, int cur_seg, int w_rank, int last_seg_count, + bool noop) +{ + args->cur_task = cur_task; + args->buff = buff; + args->seg_count = seg_count; + args->dtype = dtype; + args->root_low_rank = root_low_rank; + args->root_up_rank = root_up_rank; + args->up_comm = up_comm; + args->low_comm = low_comm; + args->num_segments = num_segments; + args->cur_seg = cur_seg; + args->w_rank = 
w_rank; + args->last_seg_count = last_seg_count; + args->noop = noop; +} + +/* + * Each segment of the messsage needs to go though 2 steps to perform MPI_Bcast: + * ub: upper level (inter-node) bcast + * lb: low level (shared-memory or intra-node) bcast. + * Hence, in each iteration, there is a combination of collective operations which is called a task. + * | seg 0 | seg 1 | seg 2 | seg 3 | + * iter 0 | ub | | | | task: t0, contains ub + * iter 1 | lb | ub | | | task: t1, contains ub and lb + * iter 2 | | lb | ub | | task: t1, contains ub and lb + * iter 3 | | | lb | ub | task: t1, contains ub and lb + * iter 4 | | | | lb | task: t1, contains lb + */ +int +mca_coll_han_bcast_intra(void *buff, + int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module) +{ + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + int err, seg_count = count, w_rank = ompi_comm_rank(comm); + ompi_communicator_t *low_comm, *up_comm; + ptrdiff_t extent, lb; + size_t dtype_size; + + /* Create the subcommunicators */ + err = mca_coll_han_comm_create(comm, han_module); + if( OMPI_SUCCESS != err ) { /* Let's hope the error is consistently returned across the entire communicator */ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle bcast with this communicator. Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. 
+ */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_bcast(buff, count, dtype, root, + comm, comm->c_coll->coll_bcast_module); + } + /* Topo must be initialized to know rank distribution which then is used to + * determine if han can be used */ + mca_coll_han_topo_init(comm, han_module, 2); + if (han_module->are_ppn_imbalanced) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle bcast with this communicator (imbalance). Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. + */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, bcast); + return comm->c_coll->coll_bcast(buff, count, dtype, root, + comm, comm->c_coll->coll_bcast_module); + } + + ompi_datatype_get_extent(dtype, &lb, &extent); + ompi_datatype_type_size(dtype, &dtype_size); + + /* use MCA parameters for now */ + low_comm = han_module->cached_low_comms[mca_coll_han_component.han_bcast_low_module]; + up_comm = han_module->cached_up_comms[mca_coll_han_component.han_bcast_up_module]; + COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_bcast_segsize, dtype_size, + seg_count); + + int num_segments = (count + seg_count - 1) / seg_count; + OPAL_OUTPUT_VERBOSE((20, mca_coll_han_component.han_output, + "In HAN seg_count %d count %d num_seg %d\n", + seg_count, count, num_segments)); + + int *vranks = han_module->cached_vranks; + int low_rank = ompi_comm_rank(low_comm); + int low_size = ompi_comm_size(low_comm); + + int root_low_rank, root_up_rank; + mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: root_low_rank %d root_up_rank %d\n", w_rank, root_low_rank, + root_up_rank)); + + /* Create t0 tasks for the first segment */ + mca_coll_task_t *t0 = OBJ_NEW(mca_coll_task_t); + /* Setup up t0 task arguments */ + mca_coll_han_bcast_args_t *t = 
malloc(sizeof(mca_coll_han_bcast_args_t)); + mca_coll_han_set_bcast_args(t, t0, (char *) buff, seg_count, dtype, + root_up_rank, root_low_rank, up_comm, low_comm, + num_segments, 0, w_rank, count - (num_segments - 1) * seg_count, + low_rank != root_low_rank); + /* Init the first task */ + init_task(t0, mca_coll_han_bcast_t0_task, (void *) t); + issue_task(t0); + + /* Create t1 task */ + mca_coll_task_t *t1 = OBJ_NEW(mca_coll_task_t); + /* Setup up t1 task arguments */ + t->cur_task = t1; + /* Init the t1 task */ + init_task(t1, mca_coll_han_bcast_t1_task, (void *) t); + issue_task(t1); + + while (t->cur_seg <= t->num_segments - 2) { + /* Create t1 task */ + t->cur_task = t1 = OBJ_NEW(mca_coll_task_t); + t->buff = (char *) t->buff + extent * seg_count; + t->cur_seg = t->cur_seg + 1; + /* Init the t1 task */ + init_task(t1, mca_coll_han_bcast_t1_task, (void *) t); + issue_task(t1); + } + + free(t); + + return OMPI_SUCCESS; +} + +/* t0 task: issue and wait for the upper level ibcast of segment 0 */ +int mca_coll_han_bcast_t0_task(void *task_args) +{ + mca_coll_han_bcast_args_t *t = (mca_coll_han_bcast_args_t *) task_args; + + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t0 %d\n", t->w_rank, + t->cur_seg)); + OBJ_RELEASE(t->cur_task); + if (t->noop) { + return OMPI_SUCCESS; + } + t->up_comm->c_coll->coll_bcast((char *) t->buff, t->seg_count, t->dtype, t->root_up_rank, + t->up_comm, t->up_comm->c_coll->coll_bcast_module); + return OMPI_SUCCESS; +} + +/* t1 task: + * 1. issue the upper level ibcast of segment cur_seg + 1 + * 2. issue the low level bcast of segment cur_seg + * 3. 
wait for the completion of the ibcast + */ +int mca_coll_han_bcast_t1_task(void *task_args) +{ + mca_coll_han_bcast_args_t *t = (mca_coll_han_bcast_args_t *) task_args; + ompi_request_t *ibcast_req = NULL; + int tmp_count = t->seg_count; + ptrdiff_t extent, lb; + + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t1 %d\n", t->w_rank, + t->cur_seg)); + OBJ_RELEASE(t->cur_task); + ompi_datatype_get_extent(t->dtype, &lb, &extent); + if (!t->noop) { + if (t->cur_seg <= t->num_segments - 2 ) { + if (t->cur_seg == t->num_segments - 2) { + tmp_count = t->last_seg_count; + } + t->up_comm->c_coll->coll_ibcast((char *) t->buff + extent * t->seg_count, + tmp_count, t->dtype, t->root_up_rank, + t->up_comm, &ibcast_req, + t->up_comm->c_coll->coll_ibcast_module); + } + } + + /* are we the last segment to be pushed downstream ? */ + tmp_count = (t->cur_seg == (t->num_segments - 1)) ? t->last_seg_count : t->seg_count; + t->low_comm->c_coll->coll_bcast((char *) t->buff, + tmp_count, t->dtype, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_bcast_module); + + if (NULL != ibcast_req) { + ompi_request_wait(&ibcast_req, MPI_STATUS_IGNORE); + } + + return OMPI_SUCCESS; +} + +int +mca_coll_han_bcast_intra_simple(void *buff, + int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + /* create the subcommunicators */ + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + ompi_communicator_t *low_comm, *up_comm; + int err, w_rank = ompi_comm_rank(comm); + + /* Create the subcommunicators */ + err = mca_coll_han_comm_create_new(comm, han_module); + if( OMPI_SUCCESS != err ) { /* Let's hope the error is consistently returned across the entire communicator */ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle bcast with this communicator. Fall back on another component\n")); + /* Put back the fallback collective support and call it once. 
All + * future calls will then be automatically redirected. + */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_bcast(buff, count, dtype, root, + comm, comm->c_coll->coll_bcast_module); + } + /* Topo must be initialized to know rank distribution which then is used to + * determine if han can be used */ + mca_coll_han_topo_init(comm, han_module, 2); + if (han_module->are_ppn_imbalanced) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle bcast with this communicator (imbalance). Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. + */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, bcast); + return comm->c_coll->coll_bcast(buff, count, dtype, root, + comm, comm->c_coll->coll_bcast_module); + } + + low_comm = han_module->sub_comm[INTRA_NODE]; + up_comm = han_module->sub_comm[INTER_NODE]; + + int *vranks = han_module->cached_vranks; + int low_rank = ompi_comm_rank(low_comm); + int low_size = ompi_comm_size(low_comm); + int root_low_rank, root_up_rank; + + mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: root_low_rank %d root_up_rank %d\n", + w_rank, root_low_rank, root_up_rank)); + + if (low_rank == root_low_rank) { + up_comm->c_coll->coll_bcast(buff, count, dtype, root_up_rank, + up_comm, up_comm->c_coll->coll_bcast_module); + + /* To remove when han has better sub-module selection. + For now switching to ibcast enables to make runs with libnbc. 
*/ + //ompi_request_t req; + //up_comm->c_coll->coll_ibcast(buff, count, dtype, root_up_rank, + // up_comm, &req, up_comm->c_coll->coll_ibcast_module); + //ompi_request_wait(&req, MPI_STATUS_IGNORE); + + } + low_comm->c_coll->coll_bcast(buff, count, dtype, root_low_rank, + low_comm, low_comm->c_coll->coll_bcast_module); + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/han/coll_han_component.c b/ompi/mca/coll/han/coll_han_component.c new file mode 100644 index 0000000000..ef55a6ac99 --- /dev/null +++ b/ompi/mca/coll/han/coll_han_component.c @@ -0,0 +1,392 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * + * Most of the description of the data layout is in the + * coll_han_module.c file. + */ + +#include "ompi_config.h" + +#include "opal/util/show_help.h" +#include "ompi/constants.h" +#include "ompi/mca/coll/coll.h" +#include "coll_han.h" +#include "coll_han_dynamic.h" +#include "coll_han_dynamic_file.h" +#include "ompi/mca/coll/base/coll_base_util.h" + +/* + * Public string showing the coll ompi_han component version number + */ +const char *mca_coll_han_component_version_string = + "Open MPI HAN collective MCA component version " OMPI_VERSION; + +ompi_coll_han_components available_components[COMPONENTS_COUNT] = { + { SELF, "self", NULL }, + { BASIC, "basic", NULL }, + { LIBNBC, "libnbc", NULL }, + { TUNED, "tuned", NULL }, + { SM, "sm", NULL }, + { SHARED, "shared", NULL }, + { ADAPT, "adapt", NULL }, + { HAN, "han", NULL } +}; + +/* + * Local functions + */ +static int han_open(void); +static int han_close(void); +static int han_register(void); + +/* + * Instantiate the public struct with all of our public information + * and pointers to our public functions in it + */ + +mca_coll_han_component_t 
mca_coll_han_component = { + /* First, fill in the super */ + { + /* First, the mca_component_t struct containing meta + information about the component itself */ + + .collm_version = { + MCA_COLL_BASE_VERSION_2_0_0, + + /* Component name and version */ + .mca_component_name = "han", + MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, + OMPI_RELEASE_VERSION), + + /* Component functions */ + .mca_open_component = han_open, + .mca_close_component = han_close, + .mca_register_component_params = han_register, + }, + .collm_data = { + /* The component is not checkpoint ready */ + MCA_BASE_METADATA_PARAM_NONE}, + + /* Initialization / querying functions */ + + .collm_init_query = mca_coll_han_init_query, + .collm_comm_query = mca_coll_han_comm_query, + }, + + /* han-component specifc information */ + + /* (default) priority */ + 20, +}; + +/* + * Init the component + */ +static int han_open(void) +{ + /* Get the global coll verbosity: it will be ours */ + mca_coll_han_component.han_output = ompi_coll_base_framework.framework_output; + + return mca_coll_han_init_dynamic_rules(); +} + + +/* + * Shut down the component + */ +static int han_close(void) +{ + mca_coll_han_free_dynamic_rules(); + return OMPI_SUCCESS; +} + +static bool is_simple_implemented(COLLTYPE_T coll) +{ + switch(coll) { + case ALLGATHER: + case ALLREDUCE: + case BCAST: + case GATHER: + case REDUCE: + return true; + default: + return false; + } +} + +const char* mca_coll_han_topo_lvl_to_str(TOPO_LVL_T topo_lvl) +{ + switch(topo_lvl) { + case INTRA_NODE: + return "intra_node"; + case INTER_NODE: + return "inter_node"; + case GLOBAL_COMMUNICATOR: + return "global_communicator"; + case NB_TOPO_LVL: + default: + return "invalid topologic level"; + } +} + + +/* + * Register MCA params + */ +static int han_register(void) +{ + mca_base_component_t *c = &mca_coll_han_component.super.collm_version; + mca_coll_han_component_t *cs = &mca_coll_han_component; + + /* Generated parameters name and 
description */ + char param_name[128], param_desc[256]; + int param_desc_size; + COLLTYPE_T coll; + TOPO_LVL_T topo_lvl; + COMPONENT_T component; + + cs->han_priority = 0; + (void) mca_base_component_var_register(c, "priority", "Priority of the HAN coll component", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_priority); + + cs->han_bcast_segsize = 65536; + (void) mca_base_component_var_register(c, "bcast_segsize", + "segment size for bcast", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_bcast_segsize); + + cs->han_bcast_up_module = 0; + (void) mca_base_component_var_register(c, "bcast_up_module", + "up level module for bcast, 0 libnbc, 1 adapt", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_bcast_up_module); + + cs->han_bcast_low_module = 0; + (void) mca_base_component_var_register(c, "bcast_low_module", + "low level module for bcast, 0 sm, 1 solo", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_bcast_low_module); + + cs->han_reduce_segsize = 524288; + (void) mca_base_component_var_register(c, "reduce_segsize", + "segment size for reduce", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_reduce_segsize); + + cs->han_reduce_up_module = 0; + (void) mca_base_component_var_register(c, "reduce_up_module", + "up level module for allreduce, 0 libnbc, 1 adapt", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_reduce_up_module); + + cs->han_reduce_low_module = 0; + (void) mca_base_component_var_register(c, "reduce_low_module", + "low level module for allreduce, 0 sm, 1 shared", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_reduce_low_module); + cs->han_allreduce_segsize = 524288; + (void) mca_base_component_var_register(c, 
"allreduce_segsize", + "segment size for allreduce", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allreduce_segsize); + + cs->han_allreduce_up_module = 0; + (void) mca_base_component_var_register(c, "allreduce_up_module", + "up level module for allreduce, 0 libnbc, 1 adapt", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allreduce_up_module); + + cs->han_allreduce_low_module = 0; + (void) mca_base_component_var_register(c, "allreduce_low_module", + "low level module for allreduce, 0 sm, 1 shared", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allreduce_low_module); + + cs->han_allgather_up_module = 0; + (void) mca_base_component_var_register(c, "allgather_up_module", + "up level module for allgather, 0 libnbc, 1 adapt", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allgather_up_module); + + cs->han_allgather_low_module = 0; + (void) mca_base_component_var_register(c, "allgather_low_module", + "low level module for allgather, 0 sm, 1 shared", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allgather_low_module); + + cs->han_gather_up_module = 0; + (void) mca_base_component_var_register(c, "gather_up_module", + "up level module for gather, 0 libnbc, 1 adapt", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_gather_up_module); + + cs->han_gather_low_module = 0; + (void) mca_base_component_var_register(c, "gather_low_module", + "low level module for gather, 0 sm, 1 shared", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_gather_low_module); + + cs->han_scatter_up_module = 0; + (void) mca_base_component_var_register(c, "scatter_up_module", + "up level module for scatter, 0 libnbc, 1 adapt", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + 
OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_scatter_up_module); + + cs->han_scatter_low_module = 0; + (void) mca_base_component_var_register(c, "scatter_low_module", + "low level module for scatter, 0 sm, 1 shared", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_scatter_low_module); + + cs->han_reproducible = 0; + (void) mca_base_component_var_register(c, "reproducible", + "whether we need reproducible results " + "(enabling this disables optimisations using topology)" + "0 disable 1 enable, default 0", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_reproducible); + /* Simple algorithms MCA parameters */ + for(coll = 0 ; coll < COLLCOUNT ; coll++) { + cs->use_simple_algorithm[coll] = false; + if(is_simple_implemented(coll)) { + snprintf(param_name, sizeof(param_name), "use_simple_%s", + mca_coll_base_colltype_to_str(coll)); + snprintf(param_desc, sizeof(param_desc), "whether to enable simple algo for %s", + mca_coll_base_colltype_to_str(coll)); + mca_base_component_var_register(c, param_name, + param_desc, + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, + &(cs->use_simple_algorithm[coll])); + } + } + + /* Dynamic rules MCA parameters */ + memset(cs->mca_rules, 0, + COLLCOUNT * (GLOBAL_COMMUNICATOR+1) * sizeof(COMPONENT_T)); + for(coll = 0; coll < COLLCOUNT; coll++) { + if(!mca_coll_han_is_coll_dynamic_implemented(coll)) { + continue; + } + /* + * Default values + */ + cs->mca_rules[coll][INTRA_NODE] = TUNED; + cs->mca_rules[coll][INTER_NODE] = BASIC; + cs->mca_rules[coll][GLOBAL_COMMUNICATOR] = HAN; + + for(topo_lvl = 0; topo_lvl < NB_TOPO_LVL; topo_lvl++) { + + snprintf(param_name, sizeof(param_name), "%s_dynamic_%s_module", + mca_coll_base_colltype_to_str(coll), + mca_coll_han_topo_lvl_to_str(topo_lvl)); + + param_desc_size = snprintf(param_desc, sizeof(param_desc), + "Collective module to use for %s on %s 
topological level: ", + mca_coll_base_colltype_to_str(coll), + mca_coll_han_topo_lvl_to_str(topo_lvl)); + /* + * Exhaustive description: + * 0 = self; 1 = basic; 2 = libnbc; ... + * FIXME: Do not print component not providing this collective + */ + for(component = 0 ; component < COMPONENTS_COUNT ; component++) { + if(HAN == component && GLOBAL_COMMUNICATOR != topo_lvl) { + /* Han can only be used on the global communicator */ + continue; + } + param_desc_size += snprintf(param_desc+param_desc_size, sizeof(param_desc) - param_desc_size, + "%d = %s; ", + component, + available_components[component].component_name); + } + + mca_base_component_var_register(c, param_name, param_desc, + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &(cs->mca_rules[coll][topo_lvl])); + } + } + + /* Dynamic rules */ + cs->use_dynamic_file_rules = false; + (void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version, + "use_dynamic_file_rules", + "Enable the dynamic selection provided via the dynamic_rules_filename MCA", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, + &(cs->use_dynamic_file_rules)); + + cs->dynamic_rules_filename = NULL; + (void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version, + "dynamic_rules_filename", + "Configuration file containing the dynamic selection rules", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, + &(cs->dynamic_rules_filename)); + + cs->dump_dynamic_rules = false; + (void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version, + "dump_dynamic_rules", + "Switch used to decide if we dump dynamic rules provided by configuration file", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, + &(cs->dump_dynamic_rules)); + + if((cs->dump_dynamic_rules || NULL != cs->dynamic_rules_filename) + && !cs->use_dynamic_file_rules) { + 
opal_output_verbose(0, cs->han_output, + "HAN: dynamic rules for collectives are not activated." + "Check coll_han_use_dynamic_file_rules MCA parameter"); + } + + cs->max_dynamic_errors = 10; + (void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version, + "max_dynamic_errors", + "Number of dynamic rules module/function " + "errors printed on rank 0 " + "with a 0 verbosity." + "Useless if coll_base_verbose is 30 or more.", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, + &(cs->max_dynamic_errors)); + + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/han/coll_han_dynamic.c b/ompi/mca/coll/han/coll_han_dynamic.c new file mode 100644 index 0000000000..d32b12fbcd --- /dev/null +++ b/ompi/mca/coll/han/coll_han_dynamic.c @@ -0,0 +1,1069 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2020 Bull S.A.S. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal/class/opal_list.h" +#include "ompi/mca/coll/han/coll_han.h" +#include "ompi/mca/coll/han/coll_han_dynamic.h" +#include "ompi/mca/coll/base/coll_base_util.h" + +/* + * Tests if a dynamic collective is implemented + * Useful for file reading warnings and MCA parameter generation + * When a new dynamic collective is implemented, this function must + * return true for it + */ +bool mca_coll_han_is_coll_dynamic_implemented(COLLTYPE_T coll_id) +{ + switch (coll_id) { + case ALLGATHER: + case ALLGATHERV: + case ALLREDUCE: + case BCAST: + case GATHER: + case REDUCE: + case SCATTER: + return true; + default: + return false; + } +} + +COMPONENT_T +mca_coll_han_component_name_to_id(const char* name) +{ + if(NULL == name) { + return -1; + } + + for( int i = SELF; i < COMPONENTS_COUNT ; i++ ) { + if (0 == strcmp(name, available_components[i].component_name)) { + return i; + } + } + return -1; +} + +/* + * Get all the collective modules initialized on this 
communicator + * This function must be called at the start of every selector implementation + * Note that han module may be not yet enabled + */ +int +mca_coll_han_get_all_coll_modules(struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module) +{ + mca_coll_base_module_t *han_base_module = (mca_coll_base_module_t *) han_module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + int nb_modules = 0; + mca_coll_base_avail_coll_t *item; + + /* If the modules have already been retrieved, return success */ + if(han_module->storage_initialized) { + return OMPI_SUCCESS; + } + /* This list is populated at communicator creation */ + OPAL_LIST_FOREACH(item, + comm->c_coll->module_list, + mca_coll_base_avail_coll_t) { + mca_coll_base_module_t *module = item->ac_module; + const char *name = item->ac_component_name; + int id = mca_coll_han_component_name_to_id(name); + + if(id >= 0 && NULL != module && module != han_base_module) { + /* + * The identifier is correct + * Store the module + */ + han_module->modules_storage.modules[id].module_handler = module; + opal_output_verbose(80, mca_coll_han_component.han_output, + "coll:han:get_all_coll_modules HAN found module %s with id %d " + "for topological level %d (%s) for communicator (%d/%s)\n", + name, id, topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); + nb_modules++; + } + } + + /* + * Add han_module on global communicator only + * to prevent any recursive call + */ + if(GLOBAL_COMMUNICATOR == han_module->topologic_level) { + han_module->modules_storage.modules[HAN].module_handler = han_base_module; + nb_modules++; + } + + opal_output_verbose(60, mca_coll_han_component.han_output, + "coll:han:get_all_coll_modules HAN sub-communicator modules storage " + "for topological level %d (%s) gets %d modules " + "for communicator (%d/%s)\n", + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + nb_modules, comm->c_contextid, comm->c_name); + + assert(0 != nb_modules); + + /* The modules have been retrieved */ + 
han_module->storage_initialized = true; + return OMPI_SUCCESS; +} + +/* + * Find the correct rule in the dynamic rules + * Assume rules are sorted by increasing value + */ +static const msg_size_rule_t* +get_dynamic_rule(COLLTYPE_T collective, + size_t msg_size, + struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module) +{ + /* Indexes of the rule */ + int coll_idx, topo_idx; + int conf_idx, msg_size_idx; + + /* Aliases */ + const mca_coll_han_dynamic_rules_t *dynamic_rules = NULL; + const collective_rule_t *coll_rule = NULL; + const topologic_rule_t *topo_rule = NULL; + const configuration_rule_t *conf_rule = NULL; + const msg_size_rule_t *msg_size_rule = NULL; + + const TOPO_LVL_T topo_lvl = han_module->topologic_level; + const int comm_size = ompi_comm_size(comm); + + COMPONENT_T component; + + /* Find the collective rule */ + dynamic_rules = &(mca_coll_han_component.dynamic_rules); + for(coll_idx = dynamic_rules->nb_collectives-1; + coll_idx >= 0; coll_idx--) { + if(dynamic_rules->collective_rules[coll_idx].collective_id == collective) { + coll_rule = &(dynamic_rules->collective_rules[coll_idx]); + break; + } + } + if(coll_idx < 0) { /* No dynamic rules for this collective */ + opal_output_verbose(60, mca_coll_han_component.han_output, + "coll:han:get_dynamic_rule HAN searched for collective %d (%s) " + "but did not find any rule for this collective\n", + collective, mca_coll_base_colltype_to_str(collective)); + return NULL; + } + + /* Find the topologic level rule */ + for(topo_idx = coll_rule->nb_topologic_levels-1; + topo_idx >= 0; topo_idx--) { + if(coll_rule->topologic_rules[topo_idx].topologic_level == topo_lvl) { + topo_rule = &(coll_rule->topologic_rules[topo_idx]); + break; + } + } + if(topo_idx < 0) { /* No topologic level rules for this collective */ + opal_output_verbose(60, mca_coll_han_component.han_output, + "coll:han:get_dynamic_rule HAN searched for topologic level %d (%s) rule " + "for collective %d (%s) but did not find any 
rule\n", + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + collective, mca_coll_base_colltype_to_str(collective)); + return NULL; + } + + /* Find the configuration rule */ + for(conf_idx = topo_rule->nb_rules-1; + conf_idx >= 0; conf_idx--) { + if(topo_rule->configuration_rules[conf_idx].configuration_size <= comm_size) { + conf_rule = &(topo_rule->configuration_rules[conf_idx]); + break; + } + } + if(conf_idx < 0) { + /* No corresponding configuration. Should not have happen with a correct file */ + opal_output_verbose(60, mca_coll_han_component.han_output, + "coll:han:get_dynamic_rule HAN searched a rule for collective %d (%s) " + "on topological level %d (%s) for a %d configuration size " + "but did not manage to find anything. " + "This is the result of an invalid configuration file: " + "the first configuration size of each collective must be 1\n", + collective, mca_coll_base_colltype_to_str(collective), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), comm_size); + return NULL; + } + + /* Find the message size rule */ + for(msg_size_idx = conf_rule->nb_msg_size-1; + msg_size_idx >= 0; msg_size_idx--) { + if(conf_rule->msg_size_rules[msg_size_idx].msg_size <= msg_size) { + msg_size_rule = &(conf_rule->msg_size_rules[msg_size_idx]); + break; + } + } + if(msg_size_idx < 0) { + /* No corresponding message size. Should not happen with a correct file */ + opal_output_verbose(60, mca_coll_han_component.han_output, + "coll:han:get_dynamic_rule HAN searched a rule for collective %d (%s) " + "on topological level %d (%s) for a %d configuration size " + "for a %" PRIsize_t " sized message but did not manage to find anything. 
" + "This is the result of an invalid configuration file: " + "the first message size of each configuration must be 0\n", + collective, mca_coll_base_colltype_to_str(collective), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm_size, msg_size); + + return NULL; + } + + component = msg_size_rule->component; + /* + * We have the final rule to use + * Module correctness is checked outside + */ + opal_output_verbose(80, mca_coll_han_component.han_output, + "coll:han:get_dynamic_rule HAN searched a rule for collective %d (%s) " + "on topological level %d (%s) for a %d configuration size " + "for a %" PRIsize_t " sized message. Found a rule for collective %d (%s) " + "on topological level %d (%s) for a %d configuration size " + "for a %" PRIsize_t " sized message : component %d (%s)\n", + collective, mca_coll_base_colltype_to_str(collective), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm_size, msg_size, msg_size_rule->collective_id, + mca_coll_base_colltype_to_str(msg_size_rule->collective_id), + msg_size_rule->topologic_level, + mca_coll_han_topo_lvl_to_str(msg_size_rule->topologic_level), + msg_size_rule->configuration_size, + msg_size_rule->msg_size, component, available_components[component].component_name); + + return msg_size_rule; +} + +/* + * Return the module to use for the collective coll_id + * for a msg_size sized message on the comm communicator + * following the dynamic rules + */ +static mca_coll_base_module_t* +get_module(COLLTYPE_T coll_id, + size_t msg_size, + struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module) +{ + const msg_size_rule_t *dynamic_rule; + TOPO_LVL_T topo_lvl; + COMPONENT_T mca_rule_component; + + topo_lvl = han_module->topologic_level; + mca_rule_component = mca_coll_han_component.mca_rules[coll_id][topo_lvl]; + + mca_coll_han_get_all_coll_modules(comm, han_module); + + /* Find the correct dynamic rule to check */ + dynamic_rule = get_dynamic_rule(coll_id, + msg_size, + comm, + han_module); + 
if(NULL != dynamic_rule) { + /* Use dynamic rule from file */ + return han_module->modules_storage.modules[dynamic_rule->component].module_handler; + } + /* + * No dynamic rule from file + * Use rule from mca parameter + */ + if(mca_rule_component < 0 || mca_rule_component >= COMPONENTS_COUNT) { + /* + * Invalid MCA parameter value + * Warn the user and return NULL + */ + opal_output_verbose(0, mca_coll_han_component.han_output, + "coll:han:get_module Invalid MCA parameter value %d " + "for collective %d (%s) on topologic level %d (%s)\n", + mca_rule_component, coll_id, + mca_coll_base_colltype_to_str(coll_id), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl)); + return NULL; + } + return han_module->modules_storage.modules[mca_rule_component].module_handler; +} + + +/* + * Allgather selector: + * On a sub-communicator, checks the stored rules to find the module to use + * On the global communicator, calls the han collective implementation, or + * calls the correct module if fallback mechanism is activated + */ +int +mca_coll_han_allgather_intra_dynamic(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + mca_coll_base_module_allgather_fn_t allgather; + mca_coll_base_module_t *sub_module; + size_t dtype_size; + int rank, verbosity = 0; + + /* Compute configuration information for dynamic rules */ + if( MPI_IN_PLACE != sbuf ) { + ompi_datatype_type_size(sdtype, &dtype_size); + dtype_size = dtype_size * scount; + } else { + ompi_datatype_type_size(rdtype, &dtype_size); + dtype_size = dtype_size * rcount; + } + sub_module = get_module(ALLGATHER, + dtype_size, + comm, + han_module); + + /* First errors are always printed by rank 0 */ + rank = ompi_comm_rank(comm); + if( (0 == rank) && 
(han_module->dynamic_errors < mca_coll_han_component.max_dynamic_errors) ) { + verbosity = 30; + } + + if(NULL == sub_module) { + /* + * No valid collective module from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_allgather_intra_dynamic " + "HAN did not find any valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s). " + "Please check dynamic file/mca parameters\n", + ALLGATHER, mca_coll_base_colltype_to_str(ALLGATHER), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/ALLGATHER: No module found for the sub-communicator. " + "Falling back to another component\n")); + allgather = han_module->previous_allgather; + sub_module = han_module->previous_allgather_module; + } else if (NULL == sub_module->coll_allgather) { + /* + * No valid collective from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_allgather_intra_dynamic HAN found valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s) but this module cannot handle this collective. " + "Please check dynamic file/mca parameters\n", + ALLGATHER, mca_coll_base_colltype_to_str(ALLGATHER), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/ALLGATHER: the module found for the sub-communicator" + " cannot handle the ALLGATHER operation. 
Falling back to another component\n")); + allgather = han_module->previous_allgather; + sub_module = han_module->previous_allgather_module; + } else if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + /* + * No fallback mechanism activated for this configuration + * sub_module is valid + * sub_module->coll_allgather is valid and point to this function + * Call han topological collective algorithm + */ + if(mca_coll_han_component.use_simple_algorithm[ALLGATHER]) { + allgather = mca_coll_han_allgather_intra_simple; + } else { + allgather = mca_coll_han_allgather_intra; + } + } else { + /* + * If we get here: + * sub_module is valid + * sub_module->coll_allgather is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + allgather = sub_module->coll_allgather; + } + return allgather(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, + sub_module); +} + + +/* + * Allgatherv selector: + * On a sub-communicator, checks the stored rules to find the module to use + * On the global communicator, calls the han collective implementation, or + * calls the correct module if fallback mechanism is activated + * The allgatherv size is the size of the biggest segment + */ +int +mca_coll_han_allgatherv_intra_dynamic(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, const int *rcounts, + const int *displs, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + mca_coll_base_module_allgatherv_fn_t allgatherv; + int rank, verbosity = 0, comm_size, i; + mca_coll_base_module_t *sub_module; + size_t dtype_size, msg_size = 0; + + /* Compute configuration information for dynamic rules */ + comm_size = ompi_comm_size(comm); + ompi_datatype_type_size(rdtype, &dtype_size); + + for(i = 0; i < 
comm_size; i++) { + if(dtype_size * rcounts[i] > msg_size) { + msg_size = dtype_size * rcounts[i]; + } + } + + sub_module = get_module(ALLGATHERV, + msg_size, + comm, + han_module); + + /* First errors are always printed by rank 0 */ + rank = ompi_comm_rank(comm); + if( (0 == rank) && (han_module->dynamic_errors < mca_coll_han_component.max_dynamic_errors) ) { + verbosity = 30; + } + + if(NULL == sub_module) { + /* + * No valid collective module from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_allgatherv_intra_dynamic " + "HAN did not find any valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s). " + "Please check dynamic file/mca parameters\n", + ALLGATHERV, mca_coll_base_colltype_to_str(ALLGATHERV), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/ALLGATHERV: No module found for the sub-communicator. " + "Falling back to another component\n")); + allgatherv = han_module->previous_allgatherv; + sub_module = han_module->previous_allgatherv_module; + } else if (NULL == sub_module->coll_allgatherv) { + /* + * No valid collective from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_allgatherv_intra_dynamic " + "HAN found valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s) " + "but this module cannot handle this collective. 
" + "Please check dynamic file/mca parameters\n", + ALLGATHERV, mca_coll_base_colltype_to_str(ALLGATHERV), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/ALLGATHERV: the module found for the sub-" + "communicator cannot handle the ALLGATHERV operation. " + "Falling back to another component\n")); + allgatherv = han_module->previous_allgatherv; + sub_module = han_module->previous_allgatherv_module; + return han_module->previous_allgatherv(sbuf, scount, sdtype, + rbuf, rcounts, displs, + rdtype, comm, + han_module->previous_allgatherv_module); + } else if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + /* + * No fallback mechanism activated for this configuration + * sub_module is valid + * sub_module->coll_allgatherv is valid and point to this function + * Call han topological collective algorithm + */ + opal_output_verbose(30, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_allgatherv_intra_dynamic " + "HAN used for collective %d (%s) with topological level %d (%s) " + "on communicator (%d/%s) but this module cannot handle " + "this collective on this topologic level\n", + ALLGATHERV, mca_coll_base_colltype_to_str(ALLGATHERV), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); + allgatherv = han_module->previous_allgatherv; + sub_module = han_module->previous_allgatherv_module; + } else { + /* + * If we get here: + * sub_module is valid + * sub_module->coll_allgatherv is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + allgatherv = sub_module->coll_allgatherv; + } + return allgatherv(sbuf, scount, sdtype, + rbuf, rcounts, displs, + rdtype, comm, + sub_module); +} + + +/* + * Allreduce selector: + * On a sub-communicator, checks the stored rules to find the module to use + * On the global communicator, calls the 
han collective implementation, or + * calls the correct module if fallback mechanism is activated + */ +int +mca_coll_han_allreduce_intra_dynamic(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + mca_coll_base_module_allreduce_fn_t allreduce; + mca_coll_base_module_t *sub_module; + size_t dtype_size; + int rank, verbosity = 0; + + /* Compute configuration information for dynamic rules */ + ompi_datatype_type_size(dtype, &dtype_size); + dtype_size = dtype_size * count; + + sub_module = get_module(ALLREDUCE, + dtype_size, + comm, + han_module); + + /* First errors are always printed by rank 0 */ + rank = ompi_comm_rank(comm); + if( (0 == rank) && (han_module->dynamic_errors < mca_coll_han_component.max_dynamic_errors) ) { + verbosity = 30; + } + + if(NULL == sub_module) { + /* + * No valid collective module from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_allreduce_intra_dynamic " + "HAN did not find any valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s). " + "Please check dynamic file/mca parameters\n", + ALLREDUCE, mca_coll_base_colltype_to_str(ALLREDUCE), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/ALLREDUCE: No module found for the sub-communicator. 
" + "Falling back to another component\n")); + allreduce = han_module->previous_allreduce; + sub_module = han_module->previous_allreduce_module; + } else if (NULL == sub_module->coll_allreduce) { + /* + * No valid collective from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_allreduce_intra_dynamic " + "HAN found valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s) " + "but this module cannot handle this collective. " + "Please check dynamic file/mca parameters\n", + ALLREDUCE, mca_coll_base_colltype_to_str(ALLREDUCE), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/ALLREDUCE: the module found for the sub-" + "communicator cannot handle the ALLREDUCE operation. " + "Falling back to another component\n")); + allreduce = han_module->previous_allreduce; + sub_module = han_module->previous_allreduce_module; + } else if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + /* Reproducibility: fallback on reproducible algo */ + if (mca_coll_han_component.han_reproducible) { + allreduce = mca_coll_han_allreduce_reproducible; + } else { + /* + * No fallback mechanism activated for this configuration + * sub_module is valid + * sub_module->coll_allreduce is valid and point to this function + * Call han topological collective algorithm + */ + if(mca_coll_han_component.use_simple_algorithm[ALLREDUCE]) { + allreduce = mca_coll_han_allreduce_intra_simple; + } else { + allreduce = mca_coll_han_allreduce_intra; + } + } + sub_module = module; + } else { + /* + * If we get here: + * sub_module is valid + * sub_module->coll_allreduce is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + allreduce = 
mca_coll_han_allreduce_intra; + } + return allreduce(sbuf, rbuf, count, dtype, + op, comm, sub_module); +} + + +/* + * Bcast selector: + * On a sub-communicator, checks the stored rules to find the module to use + * On the global communicator, calls the han collective implementation, or + * calls the correct module if fallback mechanism is activated + */ +int +mca_coll_han_bcast_intra_dynamic(void *buff, + int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + mca_coll_base_module_bcast_fn_t bcast; + mca_coll_base_module_t *sub_module; + size_t dtype_size; + int rank, verbosity = 0; + + /* Compute configuration information for dynamic rules */ + ompi_datatype_type_size(dtype, &dtype_size); + dtype_size = dtype_size * count; + + sub_module = get_module(BCAST, + dtype_size, + comm, + han_module); + + /* First errors are always printed by rank 0 */ + rank = ompi_comm_rank(comm); + if( (0 == rank) && (han_module->dynamic_errors < mca_coll_han_component.max_dynamic_errors) ) { + verbosity = 30; + } + + if(NULL == sub_module) { + /* + * No valid collective module from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_bcast_intra_dynamic " + "HAN did not find any valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s). " + "Please check dynamic file/mca parameters\n", + BCAST, mca_coll_base_colltype_to_str(BCAST), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/BCAST: No module found for the sub-communicator. 
" + "Falling back to another component\n")); + bcast = han_module->previous_bcast; + sub_module = han_module->previous_bcast_module; + } else if (NULL == sub_module->coll_bcast) { + /* + * No valid collective from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_bcast_intra_dynamic " + "HAN found valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s) " + "but this module cannot handle this collective. " + "Please check dynamic file/mca parameters\n", + BCAST, mca_coll_base_colltype_to_str(BCAST), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/BCAST: the module found for the sub-" + "communicator cannot handle the BCAST operation. " + "Falling back to another component\n")); + bcast = han_module->previous_bcast; + sub_module = han_module->previous_bcast_module; + } else if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + /* + * No fallback mechanism activated for this configuration + * sub_module is valid + * sub_module->coll_bcast is valid and point to this function + * Call han topological collective algorithm + */ + if(mca_coll_han_component.use_simple_algorithm[BCAST]) { + bcast = mca_coll_han_bcast_intra_simple; + } else { + bcast = mca_coll_han_bcast_intra; + } + sub_module = module; + } else { + /* + * If we get here: + * sub_module is valid + * sub_module->coll_bcast is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + bcast = sub_module->coll_bcast; + } + return bcast(buff, count, dtype, + root, comm, sub_module); +} + + +/* + * Gather selector: + * On a sub-communicator, checks the stored rules to find the module to use + * On the global communicator, calls the han collective implementation, 
or + * calls the correct module if fallback mechanism is activated + */ +int +mca_coll_han_gather_intra_dynamic(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + mca_coll_base_module_gather_fn_t gather; + mca_coll_base_module_t *sub_module; + size_t dtype_size; + int rank, verbosity = 0; + + /* Compute configuration information for dynamic rules */ + if( MPI_IN_PLACE != sbuf ) { + ompi_datatype_type_size(sdtype, &dtype_size); + dtype_size = dtype_size * scount; + } else { + ompi_datatype_type_size(rdtype, &dtype_size); + dtype_size = dtype_size * rcount; + } + + sub_module = get_module(GATHER, + dtype_size, + comm, + han_module); + + /* First errors are always printed by rank 0 */ + rank = ompi_comm_rank(comm); + if( (0 == rank) && (han_module->dynamic_errors < mca_coll_han_component.max_dynamic_errors) ) { + verbosity = 30; + } + + if(NULL == sub_module) { + /* + * No valid collective module from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_gather_intra_dynamic " + "HAN did not find any valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s). " + "Please check dynamic file/mca parameters\n", + GATHER, mca_coll_base_colltype_to_str(GATHER), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/GATHER: No module found for the sub-communicator. 
" + "Falling back to another component\n")); + gather = han_module->previous_gather; + sub_module = han_module->previous_gather_module; + } else if (NULL == sub_module->coll_gather) { + /* + * No valid collective from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_gather_intra_dynamic " + "HAN found valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s) " + "but this module cannot handle this collective. " + "Please check dynamic file/mca parameters\n", + GATHER, mca_coll_base_colltype_to_str(GATHER), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/GATHER: the module found for the sub-" + "communicator cannot handle the GATHER operation. " + "Falling back to another component\n")); + gather = han_module->previous_gather; + sub_module = han_module->previous_gather_module; + } else if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + /* + * No fallback mechanism activated for this configuration + * sub_module is valid + * sub_module->coll_gather is valid and point to this function + * Call han topological collective algorithm + */ + if(mca_coll_han_component.use_simple_algorithm[GATHER]) { + gather = mca_coll_han_gather_intra_simple; + } else { + gather = mca_coll_han_gather_intra; + } + } else { + /* + * If we get here: + * sub_module is valid + * sub_module->coll_gather is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + gather = sub_module->coll_gather; + } + return gather(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, + sub_module); +} + + +/* + * Reduce selector: + * On a sub-communicator, checks the stored rules to find the module to use + * On the global communicator, calls the han 
collective implementation, or + * calls the correct module if fallback mechanism is activated + */ +int +mca_coll_han_reduce_intra_dynamic(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + mca_coll_base_module_reduce_fn_t reduce; + mca_coll_base_module_t *sub_module; + size_t dtype_size; + int rank, verbosity = 0; + + /* Compute configuration information for dynamic rules */ + ompi_datatype_type_size(dtype, &dtype_size); + dtype_size = dtype_size * count; + + sub_module = get_module(REDUCE, + dtype_size, + comm, + han_module); + + /* First errors are always printed by rank 0 */ + rank = ompi_comm_rank(comm); + if( (0 == rank) && (han_module->dynamic_errors < mca_coll_han_component.max_dynamic_errors) ) { + verbosity = 30; + } + + if(NULL == sub_module) { + /* + * No valid collective module from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_reduce_intra_dynamic " + "HAN did not find any valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s). " + "Please check dynamic file/mca parameters\n", + REDUCE, mca_coll_base_colltype_to_str(REDUCE), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/REDUCE: No module found for the sub-communicator. 
" + "Falling back to another component\n")); + reduce = han_module->previous_reduce; + sub_module = han_module->previous_reduce_module; + } else if (NULL == sub_module->coll_reduce) { + /* + * No valid collective from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_reduce_intra_dynamic " + "HAN found valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s) " + "but this module cannot handle this collective. " + "Please check dynamic file/mca parameters\n", + REDUCE, mca_coll_base_colltype_to_str(REDUCE), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/REDUCE: the module found for the sub-" + "communicator cannot handle the REDUCE operation. " + "Falling back to another component\n")); + reduce = han_module->previous_reduce; + sub_module = han_module->previous_reduce_module; + } + + if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + /* Reproducibility: fallback on reproducible algo */ + if (mca_coll_han_component.han_reproducible) { + reduce = mca_coll_han_reduce_reproducible; + } else { + /* + * No fallback mechanism activated for this configuration + * sub_module is valid + * sub_module->coll_reduce is valid and point to this function + * Call han topological collective algorithm + */ + if(mca_coll_han_component.use_simple_algorithm[REDUCE]) { + reduce = mca_coll_han_reduce_intra_simple; + } else { + reduce = mca_coll_han_reduce_intra; + } + } + sub_module = module; + } else { + /* + * If we get here: + * sub_module is valid + * sub_module->coll_reduce is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + reduce = sub_module->coll_reduce; + } + return reduce(sbuf, rbuf, count, dtype, + op, root, comm, 
sub_module); +} + + +/* + * Scatter selector: + * On a sub-communicator, checks the stored rules to find the module to use + * On the global communicator, calls the han collective implementation, or + * calls the correct module if fallback mechanism is activated + */ +int +mca_coll_han_scatter_intra_dynamic(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + mca_coll_base_module_scatter_fn_t scatter; + mca_coll_base_module_t *sub_module; + size_t dtype_size; + int rank, verbosity = 0; + + /* Compute configuration information for dynamic rules */ + ompi_datatype_type_size(rdtype, &dtype_size); + dtype_size = dtype_size * rcount; + + sub_module = get_module(SCATTER, + dtype_size, + comm, + han_module); + + /* First errors are always printed by rank 0 */ + rank = ompi_comm_rank(comm); + if( (0 == rank) && (han_module->dynamic_errors < mca_coll_han_component.max_dynamic_errors) ) { + verbosity = 30; + } + + if(NULL == sub_module) { + /* + * No valid collective module from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_scatter_intra_dynamic " + "HAN did not find any valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s). " + "Please check dynamic file/mca parameters\n", + SCATTER, mca_coll_base_colltype_to_str(SCATTER), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/SCATTER: No module found for the sub-communicator. 
" + "Falling back to another component\n")); + scatter = han_module->previous_scatter; + sub_module = han_module->previous_scatter_module; + } else if (NULL == sub_module->coll_scatter) { + /* + * No valid collective from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_scatter_intra_dynamic " + "HAN found valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s) " + "but this module cannot handle this collective. " + "Please check dynamic file/mca parameters\n", + SCATTER, mca_coll_base_colltype_to_str(SCATTER), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/SCATTER: the module found for the sub-" + "communicator cannot handle the SCATTER operation. " + "Falling back to another component\n")); + scatter = han_module->previous_scatter; + sub_module = han_module->previous_scatter_module; + } else if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + /* + * No fallback mechanism activated for this configuration + * sub_module is valid + * sub_module->coll_scatter is valid and point to this function + * Call han topological collective algorithm + */ + scatter = mca_coll_han_scatter_intra; + /* + * TODO: Uncomment when scatter simple is merged + * if(mca_coll_han_component.use_simple_algorithm[SCATTER]) { + * scatter = mca_coll_han_scatter_intra_simple; + * } else { + * scatter = mca_coll_han_scatter_intra; + * } + */ + } else { + scatter = sub_module->coll_scatter; + } + + /* + * If we get here: + * sub_module is valid + * sub_module->coll_scatter is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + return scatter(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, + sub_module); +} diff --git 
a/ompi/mca/coll/han/coll_han_dynamic.h b/ompi/mca/coll/han/coll_han_dynamic.h new file mode 100644 index 0000000000..0ccecb63ba --- /dev/null +++ b/ompi/mca/coll/han/coll_han_dynamic.h @@ -0,0 +1,214 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_COLL_HAN_DYNAMIC_H +#define MCA_COLL_HAN_DYNAMIC_H + +#include "ompi_config.h" + +#include "mpi.h" +#include "ompi/mca/mca.h" +#include "opal/util/output.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/mca/coll/han/coll_han.h" + + +/* + * ################################################# + * # Dynamic rules global architecture description # + * ################################################# + * + * Han dynamic rules allow the user to define the collective + * module to call depending on the topological configuration of the + * sub-communicators and the collective parameters. This mechanism + * can also be used to fallback to the main collective on another module. + * The interface is described in coll_han_dynamic_file.h. + * + * ############################# + * # Collective module storage # + * ############################# + * To be able to switch between multiple collective modules, han + * directly accesses the module on the communicator. This information is + * stored in the collective structure of the communicator during the collective + * module choice at the communicator initialization. When han needs this + * information for the first time, it identifies the modules by their name and + * stores them in its module structure. + * Then, the modules are identified by their identifier. 
+ * + * ######################### + * # Dynamic rules storage # + * ######################### + * There are two types of dynamic rules: + * - MCA parameter defined rules + * - File defined rules + * + * MCA parameter defined rules are stored in mca_coll_han_component.mca_rules. + * This is a double indexed table. The first index is the coresponding collective + * communication and the second index is the topological level aimed by the rule. + * These parameters define the collective component to use for a specific + * collective communication on a specific topologic level. + * + * File defined rules are stored in mca_coll_han_component.dynamic_rules. + * These structures are defined bellow. The rule storage is directy deduced + * from the rule file format. + * + * File defined rules precede MCA parameter defined rules. + * + * ####################### + * # Dynamic rules usage # + * ####################### + * To choose which collective module to use on a specific configuration, han + * adds an indirection on the collective call: dynamic choice functions. These + * functions do not implement any collective. First, they try to find a dynamic + * rule from file for the given collective. If there is not any rule for the + * given configuration, MCA parameter defined rules are used. Once the module + * to use is found, the correct collective implementation is called. + * + * This indirection is also used on the global communicator. This allows han + * to provide a fallback mechanism considering the collective parameters. + * + * ############################## + * # Dynamic rules choice logic # + * ############################## + * Dynamic rules choice is made with a stack logic. Each new rule precedes + * already defined rules. MCA parameters rules are the stack base. When + * a rule is needed, rules are read as a stack and the first corresponding + * encountered is chosen. 
+ * + * Consequences: + * - If a collective identifier appears multiple times, only the last + * will be considered + * - If a topological level appears multiple times for a collective, + * only the last will be considered + * - If configuration rules or message size rules are not stored + * by increasing value, some of them will not be considered + */ + +/* Dynamic rules support */ +typedef enum COMPONENTS { + SELF = 0, + BASIC, + LIBNBC, + TUNED, + SM, + SHARED, + ADAPT, + HAN, + COMPONENTS_COUNT +} COMPONENT_T; + +typedef struct { + COMPONENT_T id; + char* component_name; + mca_coll_base_component_t* component; +} ompi_coll_han_components; + +extern ompi_coll_han_components available_components[COMPONENTS_COUNT]; + +/* Topologic levels */ +typedef enum TOPO_LVL { + INTRA_NODE = 0, + INTER_NODE, + /* Identifies the global communicator as a topologic level */ + GLOBAL_COMMUNICATOR, + NB_TOPO_LVL +} TOPO_LVL_T; + +/* Rule for a specific msg size + * in a specific configuration + * for a specific collective + * in a specific topologic level */ +typedef struct msg_size_rule_s { + COLLTYPE_T collective_id; + TOPO_LVL_T topologic_level; + int configuration_size; + + /* Message size of the rule */ + size_t msg_size; + + /* Component to use on this specific configuration + * and message size */ + COMPONENT_T component; +} msg_size_rule_t; + +/* Rule for a specific configuration + * considering a specific collective + * in a specific topologic level */ +typedef struct configuration_rule_s { + COLLTYPE_T collective_id; + TOPO_LVL_T topologic_level; + + /* Number of elements of the actual topologic level + * per element of the upper topologic level */ + int configuration_size; + + /* Number of message size rules for this configuration */ + int nb_msg_size; + + /* Table of message size rules for this configuration */ + msg_size_rule_t *msg_size_rules; +} configuration_rule_t; + +/* Set of dynamic rules for a specific collective + * in a specific topologic level */ +typedef 
struct topologic_rule_s { + /* Collective identifier */ + COLLTYPE_T collective_id; + + /* Topologic level of the rule */ + TOPO_LVL_T topologic_level; + + /* Rule number */ + int nb_rules; + + /* Table of configuration rules + * for this collective on this topologic level */ + configuration_rule_t *configuration_rules; +} topologic_rule_t; + +/* Set of dynamic rules for a collective */ +typedef struct collective_rule_s { + COLLTYPE_T collective_id; + + /* Number of topologic level for this collective */ + int nb_topologic_levels; + + /* Table of topologic level rules + * for this collective */ + topologic_rule_t *topologic_rules; +} collective_rule_t; + +/* Global dynamic rules structure */ +typedef struct mca_coll_han_dynamic_rule_s { + int nb_collectives; + collective_rule_t *collective_rules; +} mca_coll_han_dynamic_rules_t; + +/* Module storage */ +typedef struct collective_module_storage_s { + /* Module */ + mca_coll_base_module_t *module_handler; +} collective_module_storage_t; + +/* Table of module storage */ +typedef struct mca_coll_han_collective_modules_storage_s { + /* */ + collective_module_storage_t modules[COMPONENTS_COUNT]; +} mca_coll_han_collective_modules_storage_t; + +/* Tests if a dynamic collective is implemented */ +bool mca_coll_han_is_coll_dynamic_implemented(COLLTYPE_T coll_id); +COMPONENT_T mca_coll_han_component_name_to_id(const char* name); + +#endif diff --git a/ompi/mca/coll/han/coll_han_dynamic_file.c b/ompi/mca/coll/han/coll_han_dynamic_file.c new file mode 100644 index 0000000000..e6673cf941 --- /dev/null +++ b/ompi/mca/coll/han/coll_han_dynamic_file.c @@ -0,0 +1,606 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifdef HAVE_STDLIB_H +#include +#endif +#ifdef HAVE_STDIO_H +#include +#endif + +#include "ompi_config.h" +#include "coll_han.h" +#include "coll_han_dynamic.h" +#include "coll_han_dynamic_file.h" + +#include "ompi/mca/coll/base/coll_base_util.h" + +#define getnext_long(fptr, pval) ompi_coll_base_file_getnext_long(fptr, &fileline, pval) +#define getnext_string(fptr, pval) ompi_coll_base_file_getnext_string(fptr, &fileline, pval) +#define getnext_size_t(fptr, pval) ompi_coll_base_file_getnext_size_t(fptr, &fileline, pval) + +static void check_dynamic_rules(void); + +/* Current file line for verbose message */ +static int fileline = 1; + +int +mca_coll_han_init_dynamic_rules(void) +{ + /* File management */ + const char *fname; + FILE *fptr = NULL; + int nb_entries = 0, rc; + + /* Loop counters */ + int i, j, k, l; + + /* Collective informations */ + long nb_coll, coll_id; + char * coll_name = NULL; + collective_rule_t *coll_rules; + + /* Topo informations */ + long nb_topo, topo_lvl; + topologic_rule_t *topo_rules; + + /* Configuration informations */ + long nb_rules, conf_size; + configuration_rule_t *conf_rules; + + /* Message size informations */ + long nb_msg_size; + size_t msg_size; + msg_size_rule_t *msg_size_rules; + + /* Component informations */ + long component; + + /* If the dynamic rules are not used, do not even read the file */ + if(!mca_coll_han_component.use_dynamic_file_rules) { + nb_coll = 0; + return OMPI_SUCCESS; + } + + if( NULL == (fname = mca_coll_han_component.dynamic_rules_filename) ) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules coll_han_use_dynamic_file_rules is set but " + "coll_han_dynamic_rules_filename is not Rules from MCA parameters will be used instead\n"); + mca_coll_han_component.dynamic_rules.nb_collectives = 0; + return OMPI_SUCCESS; + } + + if( NULL == (fptr = fopen(fname, "r")) ) { + 
opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules cannot open dynamic file provided by " + "coll_han_dynamic_rules_filename=%s. Make sure it provides the full path and " + "check file permissions. Rules from MCA parameters will be used instead\n", + fname); + mca_coll_han_component.dynamic_rules.nb_collectives = 0; + return OMPI_SUCCESS; + } + + /* The first information of the file is the collective count */ + if( (getnext_long(fptr, &nb_coll) < 0) || (nb_coll <= 0) ) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: an invalid value %ld is given for collective count " + "or the reader encountered an unexpected EOF\n", + fname, fileline, nb_coll); + mca_coll_han_component.dynamic_rules.nb_collectives = 0; + goto file_reading_error; + } + + mca_coll_han_component.dynamic_rules.nb_collectives = nb_coll; + + /* Allocate collective rules */ + coll_rules = malloc(nb_coll * sizeof(collective_rule_t)); + mca_coll_han_component.dynamic_rules.collective_rules = coll_rules; + if(NULL == coll_rules) { + mca_coll_han_component.dynamic_rules.nb_collectives = 0; + goto cannot_allocate; + } + + /* Iterates on collective rules */ + for( i = 0 ; i < nb_coll ; i++ ) { + coll_rules[i].nb_topologic_levels = 0; + mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + + /* Get the collective identifier */ + if( getnext_string(fptr, &coll_name) < 0 ) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules invalid collective at line %d." + "The rest of the input file will be ignored.\n", + fileline); + goto file_reading_error; + } + coll_id = mca_coll_base_name_to_colltype(coll_name); + if( (coll_id < ALLGATHER) || (coll_id >= COLLCOUNT)) { + /* maybe the file was in the old format and we read the collective index instead of the name. 
*/ + char* endp; + coll_id = strtol(coll_name, &endp, 10); + if( '\0' != *endp ) { /* there is garbage in the input */ + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules invalid collective %s " + "at line %d: the collective must be at least %d and less than %d. " + "The rest of the input file will be ignored.\n", + coll_name, fileline, ALLGATHER, COLLCOUNT); + goto file_reading_error; + } + free(coll_name); + coll_name = mca_coll_base_colltype_to_str(coll_id); + } + + if(!mca_coll_han_is_coll_dynamic_implemented(coll_id)) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "read collective id %ld at line %d but this collective is not implemented yet. " + "This is not an error but this set of rules will not be used\n", + fname, coll_id, fileline); + } + + /* + * The first information of a collective rule + * is the number of topologic rules + */ + if( (getnext_long(fptr, &nb_topo) < 0) || (nb_topo < 0) ) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: an invalid value %ld is given for topo level count " + "or the reader encountered an unexpected EOF\n", + fname, fileline, nb_topo); + goto file_reading_error; + } + + /* Store the collective rule informations */ + coll_rules[i].nb_topologic_levels = nb_topo; + coll_rules[i].collective_id = (COLLTYPE_T)coll_id; + + if(0 == nb_topo) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules Warning on dynamic rules file %s " + "at line %d: an invalid value %ld is given for topo level count\n", + fname, fileline, nb_topo); + continue; + } + + /* Allocate topologic rules */ + topo_rules = malloc(nb_topo * sizeof(topologic_rule_t)); + coll_rules[i].topologic_rules = topo_rules; + if(NULL == topo_rules) { + 
coll_rules[i].nb_topologic_levels = 0; + goto cannot_allocate; + } + + /* Iterates on topologic rules */ + for( j = 0 ; j < nb_topo ; j++ ) { + topo_rules[j].nb_rules = 0; + coll_rules[i].nb_topologic_levels = j+1; + + /* Get the topologic level identifier */ + if( (getnext_long(fptr, &topo_lvl) < 0) || (topo_lvl < INTRA_NODE) || (topo_lvl >= NB_TOPO_LVL) ) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: an invalid topo level %ld is given or the reader encountered an unexpected EOF. " + "Topologic level must be at least %d and less than %d\n", + fname, fileline, topo_lvl, INTRA_NODE, NB_TOPO_LVL); + goto file_reading_error; + } + + /* + * The first information of a topologic rule + * is the number of configurations + */ + nb_rules = -1; + if( (getnext_long(fptr, &nb_rules) < 0) || (nb_rules < 0) ) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: an invalid value %ld is given for rules count " + "or the reader encountered an unexpected EOF\n", + fname, fileline, nb_rules); + goto file_reading_error; + } + + /* Store the topologic rule informations */ + topo_rules[j].collective_id = coll_id; + topo_rules[j].topologic_level = (TOPO_LVL_T)topo_lvl; + topo_rules[j].nb_rules = nb_rules; + + if(0 == nb_rules) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules Warning on dynamic rules file %s " + "at line %d: an invalid value %ld is given for configuration rules count\n", + fname, fileline, nb_rules); + continue; + } + + /* Allocate configuration rules */ + conf_rules = malloc(nb_rules * sizeof(configuration_rule_t)); + topo_rules[j].configuration_rules = conf_rules; + if(NULL == conf_rules) { + topo_rules[j].nb_rules = 0; + goto cannot_allocate; + } + + /* Iterate on configuration rules */ + 
for( k = 0; k < nb_rules; k++ ) { + conf_rules[k].nb_msg_size = 0; + topo_rules[j].nb_rules = k+1; + + /* Get the configuration size */ + if( (getnext_long(fptr, &conf_size) < 0) || (conf_size < 1) || (0 == k && conf_size > 1) ) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules invalid configuration size %ld at line %d " + "or the reader encountered an unexpected EOF the configuration size must be at least %d " + "and the first configuration size of a topologic level must be %d\n", + conf_size, fileline, 1, 1); + goto file_reading_error; + } + + /* + * The first information of a configuration rule + * is the number of message size rules + */ + if( (getnext_long(fptr, &nb_msg_size) < 0) || (nb_msg_size < 0) ) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: an invalid value %ld is given for message size rules count " + "or the reader encountered an unexpected EOF\n", + fname, fileline, nb_msg_size); + goto file_reading_error; + } + + /* Store configuration rule information */ + conf_rules[k].collective_id = coll_id; + conf_rules[k].topologic_level = topo_lvl; + conf_rules[k].configuration_size = conf_size; + conf_rules[k].nb_msg_size = nb_msg_size; + + if(0 == nb_msg_size) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules Warning on dynamic rules file %s " + "at line %d: an invalid value %ld is given for message size rules count\n", + fname, fileline, nb_msg_size); + continue; + } + + /* Allocate message size rules */ + msg_size_rules = malloc(nb_msg_size * sizeof(msg_size_rule_t)); + conf_rules[k].msg_size_rules = msg_size_rules; + if(NULL == msg_size_rules) { + conf_rules[k].nb_msg_size = 0; + goto cannot_allocate; + } + + /* Iterate on message size rules */ + for( l = 0; l < nb_msg_size; l++ ) { + char* target_comp_name = NULL; + 
conf_rules[k].nb_msg_size = l+1; + + /* Get the message size */ + rc = getnext_size_t(fptr, &msg_size); + if( (rc < 0) || + (0 == l && msg_size > 1)) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: an invalid value %" PRIsize_t " is given for message size " + "or the reader encountered an unexpected EOF. " + "The first message size rule of a configuration must be 0\n", + fname, fileline, msg_size); + goto file_reading_error; + } + + /* Get the component identifier for this message size rule */ + if( getnext_string(fptr, &target_comp_name) < 0 ) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: cannot read the name of a collective component\n", + fname, fileline); + goto file_reading_error; + } + component = mca_coll_han_component_name_to_id(target_comp_name); + if( (component < SELF) || (component >= COMPONENTS_COUNT) ) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: an invalid collective component name %s was given or the " + "reader encountered an unexpected EOF. 
Collective component id must be at " + "least %d and less than %d\n", + fname, fileline, target_comp_name, SELF, COMPONENTS_COUNT); + free(target_comp_name); + goto file_reading_error; + } + + /* Store message size rule information */ + msg_size_rules[l].collective_id = coll_id; + msg_size_rules[l].topologic_level = topo_lvl; + msg_size_rules[l].configuration_size = conf_size; + msg_size_rules[l].msg_size = msg_size; + msg_size_rules[l].component = (COMPONENT_T)component; + + nb_entries++; + /* do we have the optional segment length */ + if( 1 == ompi_coll_base_file_peek_next_char_is(fptr, &fileline, '[') ) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules found optional pipelining segment lengths\n"); + long seglength; + if( 0 != topo_lvl ) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "file %s line %d found segment lengths for topological collective at level != 0 " + "for collective %s component %s. These values will be ignored.\n", + fname, fileline, coll_name, target_comp_name); + } + while( 0 == ompi_coll_base_file_peek_next_char_is(fptr, &fileline, ']') ) { + if( getnext_long(fptr, &seglength) ) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "file %s line %d found end of file while reading the optional list " + "of segment lengths for collective %s component %s\n", + fname, fileline, coll_name, target_comp_name); + free(target_comp_name); + goto file_reading_error; + } + } + } + free(target_comp_name); + } + } + } + if( NULL != coll_name ) { + free(coll_name); + coll_name = NULL; + } + } + + if( getnext_long(fptr, &nb_coll) > 0 ) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules. 
Warning on file %s at line %d: " + "rule reading is over but reader does not seem to have reached the end of the file\n", + fname, fileline); + } + + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules read %d rules from %s\n", + nb_entries, fname); + + if(mca_coll_han_component.dump_dynamic_rules) { + mca_coll_han_dump_dynamic_rules(); + } + + fclose(fptr); + + check_dynamic_rules(); + return OMPI_SUCCESS; + +cannot_allocate: + /* The dynamic rules allocation failed + * Free the already allocated rules and return a failure + */ + opal_output_verbose(0, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "cannot allocate dynamic rules\n"); + /* Do not check free_dynamic_rules + * because we are returning OMPI_ERROR anyway */ + mca_coll_han_free_dynamic_rules(); + return OMPI_ERROR; + +file_reading_error: + if( NULL != coll_name ) { + free(coll_name); + } + opal_output_verbose(0, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "could not fully read dynamic rules file. " + "Will use mca parameters defined rules. 
" + "To see error detail, please set " + "collective verbosity level over 5\n"); + if(fptr) { + fclose (fptr); + } + mca_coll_han_free_dynamic_rules(); + return OMPI_SUCCESS; +} + +void +mca_coll_han_free_dynamic_rules(void) +{ + /* Loop counters */ + int i, j, k; + + /* Loop ranges */ + int nb_coll, nb_topo, nb_conf; + + /* Aliases */ + collective_rule_t *coll_rules; + topologic_rule_t *topo_rules; + configuration_rule_t *conf_rules; + + nb_coll = mca_coll_han_component.dynamic_rules.nb_collectives; + coll_rules = mca_coll_han_component.dynamic_rules.collective_rules; + + for(i=0 ; i 0) { + free(conf_rules[k].msg_size_rules); + } + } + + if(nb_conf > 0) { + free(conf_rules); + } + } + + if(nb_topo > 0) { + free(topo_rules); + } + } + + if(nb_coll > 0) { + free(coll_rules); + } + + mca_coll_han_component.dynamic_rules.nb_collectives = 0; +} + +/* + * Try to find any logical issue in dynamic rules + */ +static void check_dynamic_rules(void) +{ + /* Loop counters */ + int i, j, k, l; + + /* Collective informations */ + int nb_coll; + COLLTYPE_T coll_id; + collective_rule_t *coll_rules; + + /* Topo informations */ + int nb_topo; + TOPO_LVL_T topo_lvl; + topologic_rule_t *topo_rules; + + /* Configuration informations */ + int nb_rules, conf_size; + configuration_rule_t *conf_rules; + + /* Message size informations */ + int nb_msg_size; + size_t msg_size; + msg_size_rule_t *msg_size_rules; + + /* Component informations */ + COMPONENT_T component; + + nb_coll = mca_coll_han_component.dynamic_rules.nb_collectives; + coll_rules = mca_coll_han_component.dynamic_rules.collective_rules; + + for( i = 0; i < nb_coll; i++ ) { + coll_id = coll_rules[i].collective_id; + nb_topo = coll_rules[i].nb_topologic_levels; + topo_rules = coll_rules[i].topologic_rules; + + for( j = 0; j < nb_topo; j++ ) { + topo_lvl = topo_rules[j].topologic_level; + nb_rules = topo_rules[j].nb_rules; + conf_rules = topo_rules[j].configuration_rules; + + for( k = 0; k < nb_rules; k++ ) { + conf_size = 
conf_rules[k].configuration_size; + nb_msg_size = conf_rules[k].nb_msg_size; + msg_size_rules = conf_rules[k].msg_size_rules; + + if( k >= 1 && conf_rules[k-1].configuration_size > conf_size) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:check_dynamic_rules HAN found an issue on dynamic rules " + "for collective %d on topological level %d: " + "configuration sizes %d and %d are not sorted by increasing value\n", + coll_id, topo_lvl, conf_rules[k-1].configuration_size, conf_size); + } + + for( l = 0; l < nb_msg_size; l++ ) { + msg_size = msg_size_rules[l].msg_size; + component = msg_size_rules[l].component; + + if( l >= 1 && msg_size_rules[l-1].msg_size > msg_size) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:check_dynamic_rules HAN found an issue on dynamic rules " + "for collective %d on topological level %d with configuration size %d: " + "message sizes %" PRIsize_t " and %" PRIsize_t " are " + "not sorted by increasing value\n", + coll_id, topo_lvl, conf_size, msg_size_rules[l-1].msg_size, msg_size); + } + + if( (HAN == component) && (GLOBAL_COMMUNICATOR != topo_lvl) ) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:check_dynamic_rules HAN found an issue on dynamic rules " + "for collective %d on topological level %d with configuration size %d " + "for message size %" PRIsize_t ": han collective component %d " + "can only be activated for topology level %d\n", + coll_id, topo_lvl, conf_size, msg_size, HAN, GLOBAL_COMMUNICATOR); + } + } + } + } + } +} + +void mca_coll_han_dump_dynamic_rules(void) +{ + int nb_entries = 0; + + /* Collective informations */ + int nb_coll; + COLLTYPE_T coll_id; + collective_rule_t *coll_rules; + + /* Topo informations */ + int nb_topo; + TOPO_LVL_T topo_lvl; + topologic_rule_t *topo_rules; + + /* Configuration informations */ + int nb_rules, conf_size; + configuration_rule_t *conf_rules; + + /* Message size informations */ + int nb_msg_size, msg_size; 
+ msg_size_rule_t *msg_size_rules; + + /* Component informations */ + COMPONENT_T component; + + nb_coll = mca_coll_han_component.dynamic_rules.nb_collectives; + coll_rules = mca_coll_han_component.dynamic_rules.collective_rules; + + for(int i = 0; i < nb_coll; i++ ) { + coll_id = coll_rules[i].collective_id; + nb_topo = coll_rules[i].nb_topologic_levels; + topo_rules = coll_rules[i].topologic_rules; + + for(int j = 0; j < nb_topo; j++ ) { + topo_lvl = topo_rules[j].topologic_level; + nb_rules = topo_rules[j].nb_rules; + conf_rules = topo_rules[j].configuration_rules; + + for(int k = 0; k < nb_rules; k++ ) { + conf_size = conf_rules[k].configuration_size; + nb_msg_size = conf_rules[k].nb_msg_size; + msg_size_rules = conf_rules[k].msg_size_rules; + + for(int l = 0; l < nb_msg_size; l++ ) { + msg_size = msg_size_rules[l].msg_size; + component = msg_size_rules[l].component; + + opal_output(mca_coll_han_component.han_output, + "coll:han:dump_dynamic_rules %d collective %d (%s) " + "topology level %d (%s) configuration size %d " + "mesage size %d -> collective component %d (%s)\n", + nb_entries, coll_id, mca_coll_base_colltype_to_str(coll_id), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), conf_size, + msg_size, component, available_components[component].component_name); + + nb_entries++; + } + } + } + } +} diff --git a/ompi/mca/coll/han/coll_han_dynamic_file.h b/ompi/mca/coll/han/coll_han_dynamic_file.h new file mode 100644 index 0000000000..b61ba0c5d8 --- /dev/null +++ b/ompi/mca/coll/han/coll_han_dynamic_file.h @@ -0,0 +1,110 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#ifndef MCA_COLL_HAN_DYNAMIC_FILE_H +#define MCA_COLL_HAN_DYNAMIC_FILE_H + +#include "ompi_config.h" + +#include "mpi.h" +#include "ompi/mca/mca.h" +#include "opal/util/output.h" + +/* + * ############################ + * # MCA parameters interface # + * ############################ + * An MCA parameter defined rule allows the user to choose which collective + * module will be used for a specific collective communication on a specific + * topological level. The standard name for these parameters is: + * [collective]_dynamic_[topologic_level]_module + * + * ####################### + * # Dynamic file format # + * ####################### + * File defined rules precede MCA parameter defined rule. + * To activate file reader, the MCA parameter use_dynamic_file_rules must + * be set to true. The path to the dynamic file is given by the MCA + * parameter dynamic_rules_filename. If there is any issue reading the file, + * the file is considered as invalid and only MCA parameter defined rules are + * used. If a potential logical issue is identified in the file, a + * warning is printed but the file is not considered as invalid. + * + * The file is built recursively. + * A set of rules of a type is built as follows: + * Number of rules of the set + * Rule1 + * Rule2 + * ... + * + * A rule of the level i is built as follows (excluding message size rule): + * Rule property + * Set of rules of level i+1 + * + * A message size rule is built as follows: + * Message_size Component + * + * Rule properties are (by increasing level): + * - Collective identifier: + * Defined in ompi/mca/coll/base/coll_base_functions.h. + * - Topologic level: + * Defined in coll_han_dynamic.h. It defines the communicator + * topology level. This is GLOBAL_COMMUNICATOR for the user + * communicator and the corresponding level for sub-communicators + * created by han. 
+ * - Configuration size:
+ * The configuration size is the number of elements in a topology level.
+ * For example, if topology levels are intra-node and inter-node, it can
+ * be the number of MPI ranks per node or the number of nodes in the global
+ * communicator. For the GLOBAL_COMMUNICATOR topologic level,
+ * the configuration size is the communicator size.
+ * - Message_size Component:
+ * This is the message size, in bytes, of the message. Component is
+ * the component identifier to use for this collective on this
+ * communicator with this message size. Component identifiers are
+ * defined in coll_han_dynamic.h
+ *
+ * Here is an example of a dynamic rules file:
+ * 2 # Collective count
+ * 7 # Collective identifier 1 (defined in ompi/mca/coll/base/coll_base_functions.h)
+ * 2 # Topologic level count
+ * 0 # Topologic level identifier 1
+ * 1 # Configuration count
+ * 1 # Configuration size 1
+ * 2 # Message size rules count
+ * 0 3 # Message size 1 and component identifier
+ * 128 1 # Message size 2 and component identifier
+ * 1 # Topologic level identifier 2
+ * 1 # Configuration count
+ * 1 # Configuration size 1
+ * 1 # Message size rules count
+ * 0 1 # Message size 1 and component identifier
+ * 3 # Collective identifier 2
+ * # Set of topological rules
+ *
+ * Note that configuration size and message size rules define minimal
+ * values and each new rule precedes every other rule. This property
+ * implies that these types of rules must be sorted by increasing value.
+ * If they are not, some rules won't be used.
+ *
+ * The counts define a stack. If the count is set to x, the reader will
+ * attempt to read x rules of the corresponding type. If a set of rules
+ * has an invalid count, this is an error and it might not be detected by
+ * the reader.
+ */ + +int mca_coll_han_init_dynamic_rules(void); +void mca_coll_han_free_dynamic_rules(void); +void mca_coll_han_dump_dynamic_rules(void); + +#endif diff --git a/ompi/mca/coll/han/coll_han_gather.c b/ompi/mca/coll/han/coll_han_gather.c new file mode 100644 index 0000000000..14b87bde92 --- /dev/null +++ b/ompi/mca/coll/han/coll_han_gather.c @@ -0,0 +1,511 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "coll_han.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/mca/coll/base/coll_tags.h" +#include "ompi/mca/pml/pml.h" +#include "coll_han_trigger.h" + +static int mca_coll_han_gather_lg_task(void *task_args); +static int mca_coll_han_gather_ug_task(void *task_args); + +/* only work with regular situation (each node has equal number of processes) */ + +static inline void +mca_coll_han_set_gather_args(mca_coll_han_gather_args_t * args, + mca_coll_task_t * cur_task, + void *sbuf, + void *sbuf_inter_free, + int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, + int rcount, + struct ompi_datatype_t *rdtype, + int root, + int root_up_rank, + int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int w_rank, bool noop, bool is_mapbycore, ompi_request_t * req) +{ + args->cur_task = cur_task; + args->sbuf = sbuf; + args->sbuf_inter_free = sbuf_inter_free; + args->scount = scount; + args->sdtype = sdtype; + args->rbuf = rbuf; + args->rcount = rcount; + args->rdtype = rdtype; + args->root = root; + args->root_up_rank = root_up_rank; + args->root_low_rank = root_low_rank; + args->up_comm = up_comm; + args->low_comm = low_comm; + args->w_rank = w_rank; + args->noop = noop; + args->is_mapbycore = is_mapbycore; + args->req = req; +} + +int 
+mca_coll_han_gather_intra(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module) +{ + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; + int w_rank, w_size; /* information about the global communicator */ + int root_low_rank, root_up_rank; /* root ranks for both sub-communicators */ + char *reorder_buf = NULL, *reorder_rbuf = NULL; + int i, err, *vranks, low_rank, low_size, *topo; + ompi_request_t *temp_request = NULL; + + /* Create the subcommunicators */ + err = mca_coll_han_comm_create(comm, han_module); + if( OMPI_SUCCESS != err ) { /* Let's hope the error is consistently returned across the entire communicator */ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle gather with this communicator. Fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf, + rcount, rdtype, root, + comm, comm->c_coll->coll_gather_module); + } + + /* Topo must be initialized to know rank distribution which then is used to + * determine if han can be used */ + topo = mca_coll_han_topo_init(comm, han_module, 2); + if (han_module->are_ppn_imbalanced) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle gather with this communicator (imbalance). Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. 
+ */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, gather); + return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf, + rcount, rdtype, root, + comm, comm->c_coll->coll_gather_module); + } + + w_rank = ompi_comm_rank(comm); + w_size = ompi_comm_size(comm); + + /* Set up request */ + temp_request = OBJ_NEW(ompi_request_t); + temp_request->req_state = OMPI_REQUEST_ACTIVE; + temp_request->req_type = OMPI_REQUEST_COLL; + temp_request->req_free = han_request_free; + temp_request->req_status = (ompi_status_public_t){0}; + temp_request->req_complete = REQUEST_PENDING; + + /* create the subcommunicators */ + ompi_communicator_t *low_comm = + han_module->cached_low_comms[mca_coll_han_component.han_gather_low_module]; + ompi_communicator_t *up_comm = + han_module->cached_up_comms[mca_coll_han_component.han_gather_up_module]; + + /* Get the 'virtual ranks' mapping correspondong to the communicators */ + vranks = han_module->cached_vranks; + /* information about sub-communicators */ + low_rank = ompi_comm_rank(low_comm); + low_size = ompi_comm_size(low_comm); + /* Get root ranks for low and up comms */ + mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); + + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: Han Gather root %d root_low_rank %d root_up_rank %d\n", + w_rank, root, root_low_rank, root_up_rank)); + + + /* Allocate reorder buffers */ + if (w_rank == root) { + /* if the processes are mapped-by core, no need to reorder: + * distribution of ranks on core first and node next, + * in a increasing order for both patterns */ + if (han_module->is_mapbycore) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: Han Gather is_bycore: ", w_rank)); + reorder_rbuf = (char *)rbuf; + + } else { + /* Need a buffer to store unordered final result */ + ptrdiff_t rsize, rgap; + rsize = opal_datatype_span(&rdtype->super, + (int64_t)rcount * w_size, + &rgap); + reorder_buf = (char *)malloc(rsize); //TODO:free + /* 
rgap is the size of unused space at the start of the datatype */ + reorder_rbuf = reorder_buf - rgap; + + if (MPI_IN_PLACE == sbuf) { + ptrdiff_t rextent; + ompi_datatype_type_extent(rdtype, &rextent); + ptrdiff_t block_size = rextent * (ptrdiff_t)rcount; + ptrdiff_t src_shift = block_size * w_rank; + ptrdiff_t dest_shift = block_size * w_rank; + ompi_datatype_copy_content_same_ddt(rdtype, + (ptrdiff_t)rcount, + (char *)rbuf + dest_shift, + reorder_rbuf + src_shift); + } + } + } + + + /* Create lg task */ + mca_coll_task_t *lg = OBJ_NEW(mca_coll_task_t); + /* Setup lg task arguments */ + mca_coll_han_gather_args_t *lg_args = malloc(sizeof(mca_coll_han_gather_args_t)); + mca_coll_han_set_gather_args(lg_args, lg, (char *) sbuf, NULL, scount, sdtype, reorder_rbuf, + rcount, rdtype, root, root_up_rank, root_low_rank, up_comm, + low_comm, w_rank, low_rank != root_low_rank, han_module->is_mapbycore, temp_request); + /* Init lg task */ + init_task(lg, mca_coll_han_gather_lg_task, (void *) (lg_args)); + /* Issure lg task */ + issue_task(lg); + + ompi_request_wait(&temp_request, MPI_STATUS_IGNORE); + + /* Suppose, the expected message is 0 1 2 3 4 5 6 7 but the processes are + * mapped on 2 nodes, for example |0 2 4 6| |1 3 5 7|. The messages from + * low gather will be 0 2 4 6 and 1 3 5 7. + * So the upper gather result is 0 2 4 6 1 3 5 7 which must be reordered. + * The 3rd element (4) must be recopied at the 4th place. 
In general, the + * i-th element must be recopied at the place given by the i-th entry of the + * topology, which is topo[i*topolevel +1] + */ + /* reorder rbuf based on rank */ + if (w_rank == root && !han_module->is_mapbycore) { + ptrdiff_t rextent; + ompi_datatype_type_extent(rdtype, &rextent); + for (i=0; iw_rank)); + ompi_datatype_t *dtype; + size_t count; + if (t->w_rank == t->root) { + dtype = t->rdtype; + count = t->rcount; + } else { + dtype = t->sdtype; + count = t->scount; + } + + /* If the process is one of the node leader */ + char *tmp_buf = NULL; + char *tmp_rbuf = NULL; + if (!t->noop) { + /* if the process is one of the node leader, allocate the intermediary + * buffer to gather on the low sub communicator */ + int low_size = ompi_comm_size(t->low_comm); + int low_rank = ompi_comm_rank(t->low_comm); + ptrdiff_t rsize, rgap = 0; + rsize = opal_datatype_span(&dtype->super, + count * low_size, + &rgap); + tmp_buf = (char *) malloc(rsize); + tmp_rbuf = tmp_buf - rgap; + if (t->w_rank == t->root) { + if (MPI_IN_PLACE == t->sbuf) { + ptrdiff_t rextent; + ompi_datatype_type_extent(dtype, &rextent); + ptrdiff_t block_size = rextent * (ptrdiff_t)count; + ptrdiff_t src_shift = block_size * t->w_rank; + ptrdiff_t dest_shift = block_size * low_rank; + ompi_datatype_copy_content_same_ddt(dtype, + (ptrdiff_t)count, + tmp_rbuf + dest_shift, + (char *)t->rbuf + src_shift); + } + } + } + + /* Low level (usually intra-node or shared memory) node gather */ + t->low_comm->c_coll->coll_gather((char *)t->sbuf, + count, + dtype, + tmp_rbuf, + count, + dtype, + t->root_low_rank, + t->low_comm, + t->low_comm->c_coll->coll_gather_module); + + /* Prepare up comm gather */ + t->sbuf = tmp_rbuf; + t->sbuf_inter_free = tmp_buf; + + /* Create ug (upper level all-gather) task */ + mca_coll_task_t *ug = t->cur_task; + /* Init ug task */ + init_task(ug, mca_coll_han_gather_ug_task, (void *) t); + /* Issure ug task */ + issue_task(ug); + + return OMPI_SUCCESS; +} + +/* ug: upper 
level (intra-node) gather task */ +int mca_coll_han_gather_ug_task(void *task_args) +{ + mca_coll_han_gather_args_t *t = (mca_coll_han_gather_args_t *) task_args; + OBJ_RELEASE(t->cur_task); + + if (t->noop) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] Han Gather: ug noop\n", t->w_rank)); + } else { + ompi_datatype_t *dtype; + size_t count; + if (t->w_rank == t->root) { + dtype = t->rdtype; + count = t->rcount; + } else { + dtype = t->sdtype; + count = t->scount; + } + + + int low_size = ompi_comm_size(t->low_comm); + /* inter node gather */ + t->up_comm->c_coll->coll_gather((char *)t->sbuf, + count*low_size, + dtype, + (char *)t->rbuf, + count*low_size, + dtype, + t->root_up_rank, + t->up_comm, + t->up_comm->c_coll->coll_gather_module); + + if (t->sbuf_inter_free != NULL) { + free(t->sbuf_inter_free); + t->sbuf_inter_free = NULL; + } + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] Han Gather: ug gather finish\n", t->w_rank)); + } + ompi_request_t *temp_req = t->req; + free(t); + ompi_request_complete(temp_req, 1); + return OMPI_SUCCESS; +} + +/* only work with regular situation (each node has equal number of processes) */ +int +mca_coll_han_gather_intra_simple(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + int *topo, w_rank = ompi_comm_rank(comm); + int w_size = ompi_comm_size(comm); + + /* Create the subcommunicators */ + if( OMPI_SUCCESS != mca_coll_han_comm_create_new(comm, han_module) ) { /* Let's hope the error is consistently returned across the entire communicator */ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle gather with this communicator. 
Fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf, + rcount, rdtype, root, + comm, comm->c_coll->coll_gather_module); + } + + /* Topo must be initialized to know rank distribution which then is used to + * determine if han can be used */ + topo = mca_coll_han_topo_init(comm, han_module, 2); + if (han_module->are_ppn_imbalanced){ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle gather with this communicator (imbalance). Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. + */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, gather); + return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf, + rcount, rdtype, root, + comm, comm->c_coll->coll_gather_module); + } + + ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; + ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; + ompi_datatype_t *dtype; + size_t count; + + if (w_rank == root) { + dtype = rdtype; + count = rcount; + } else { + dtype = sdtype; + count = scount; + } + + + /* Get the 'virtual ranks' mapping corresponding to the communicators */ + int *vranks = han_module->cached_vranks; + /* information about sub-communicators */ + int low_rank = ompi_comm_rank(low_comm); + int low_size = ompi_comm_size(low_comm); + /* Get root ranks for low and up comms */ + int root_low_rank, root_up_rank; /* root ranks for both sub-communicators */ + mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); + + /* allocate buffer to store unordered result on root + * if the processes are mapped-by core, no need to reorder: + * distribution of ranks on core first and node next, + * in a increasing order for both patterns */ + char *reorder_buf = NULL; // 
allocated memory + char *reorder_buf_start = NULL; // start of the data + if (w_rank == root) { + if (han_module->is_mapbycore) { + reorder_buf_start = (char *)rbuf; + } else { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: Future Gather needs reordering: ", w_rank)); + ptrdiff_t rgap = 0; + ptrdiff_t rsize = opal_datatype_span(&rdtype->super, + (int64_t)rcount * w_size, + &rgap); + reorder_buf = (char *)malloc(rsize); + /* rgap is the size of unused space at the start of the datatype */ + reorder_buf_start = reorder_buf - rgap; + } + + } + + /* allocate the intermediary buffer + * to gather on leaders on the low sub communicator */ + char *tmp_buf = NULL; // allocated memory + char *tmp_buf_start = NULL; // start of the data + if (low_rank == root_low_rank) { + ptrdiff_t rsize, rgap = 0; + rsize = opal_datatype_span(&dtype->super, + count * low_size, + &rgap); + tmp_buf = (char *) malloc(rsize); + tmp_buf_start = tmp_buf - rgap; + } + + /* 1. low gather on nodes leaders */ + low_comm->c_coll->coll_gather((char *)sbuf, + count, + dtype, + tmp_buf_start, + count, + dtype, + root_low_rank, + low_comm, + low_comm->c_coll->coll_gather_module); + + /* 2. upper gather (inter-node) between node leaders */ + if (low_rank == root_low_rank) { + up_comm->c_coll->coll_gather((char *)tmp_buf_start, + count*low_size, + dtype, + (char *)reorder_buf_start, + count*low_size, + dtype, + root_up_rank, + up_comm, + up_comm->c_coll->coll_gather_module); + + if (tmp_buf != NULL) { + free(tmp_buf); + tmp_buf = NULL; + tmp_buf_start = NULL; + } + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] Future Gather: ug gather finish\n", w_rank)); + } + + /* 3. 
reorder data on root into rbuf + * if ranks are not mapped in topological order, data needs to be reordered + * (see reorder_gather) + */ + if (w_rank == root && !han_module->is_mapbycore) { + ompi_coll_han_reorder_gather(reorder_buf_start, + rbuf, rcount, rdtype, + comm, topo); + free(reorder_buf); + } + + return OMPI_SUCCESS; +} + +/* Reorder after gather operation, for unordered ranks + * + * Suppose, the expected message is 0 1 2 3 4 5 6 7 but the processes are + * mapped on 2 nodes, for example |0 2 4 6| |1 3 5 7|. The messages from + * low gather will be 0 2 4 6 and 1 3 5 7. + * So the upper gather result is 0 2 4 6 1 3 5 7 which must be reordered. + * The 3rd element (4) must be recopied at the 4th place. In general, the + * i-th element must be recopied at the place given by the i-th entry of the + * topology, which is topo[i*topolevel +1] + */ +void +ompi_coll_han_reorder_gather(const void *sbuf, + void *rbuf, int rcount, + struct ompi_datatype_t *dtype, + struct ompi_communicator_t *comm, + int * topo) +{ + int i, topolevel = 2; // always 2 levels in topo + int w_rank = ompi_comm_rank(comm); + int w_size = ompi_comm_size(comm); + ptrdiff_t rextent; + ompi_datatype_type_extent(dtype, &rextent); + for ( i = 0; i < w_size; i++ ) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: Future reorder from %d to %d\n", + w_rank, + i * topolevel + 1, + topo[i * topolevel + 1])); + ptrdiff_t block_size = rextent * (ptrdiff_t)rcount; + ptrdiff_t src_shift = block_size * i; + ptrdiff_t dest_shift = block_size * (ptrdiff_t)topo[i * topolevel + 1]; + ompi_datatype_copy_content_same_ddt(dtype, + (ptrdiff_t)rcount, + (char *)rbuf + dest_shift, + (char *)sbuf + src_shift); + } +} diff --git a/ompi/mca/coll/han/coll_han_module.c b/ompi/mca/coll/han/coll_han_module.c new file mode 100644 index 0000000000..1a3a7e5c66 --- /dev/null +++ b/ompi/mca/coll/han/coll_han_module.c @@ -0,0 +1,339 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The 
University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "mpi.h" +#include "coll_han.h" +#include "coll_han_dynamic.h" + + +/* + * Local functions + */ +static int han_module_enable(mca_coll_base_module_t * module, + struct ompi_communicator_t *comm); +static int mca_coll_han_module_disable(mca_coll_base_module_t * module, + struct ompi_communicator_t *comm); + +#define CLEAN_PREV_COLL(HANDLE, NAME) \ + do { \ + (HANDLE)->fallback.NAME.NAME = NULL; \ + (HANDLE)->fallback.NAME.module = NULL; \ + } while (0) + +/* + * Module constructor + */ +static void han_module_clear(mca_coll_han_module_t *han_module) +{ + CLEAN_PREV_COLL(han_module, allgather); + CLEAN_PREV_COLL(han_module, allgatherv); + CLEAN_PREV_COLL(han_module, allreduce); + CLEAN_PREV_COLL(han_module, bcast); + CLEAN_PREV_COLL(han_module, reduce); + CLEAN_PREV_COLL(han_module, gather); + CLEAN_PREV_COLL(han_module, scatter); + + han_module->reproducible_reduce = NULL; + han_module->reproducible_reduce_module = NULL; + han_module->reproducible_allreduce = NULL; + han_module->reproducible_allreduce_module = NULL; +} + +static void mca_coll_han_module_construct(mca_coll_han_module_t * module) +{ + int i; + + module->enabled = true; + module->super.coll_module_disable = mca_coll_han_module_disable; + module->cached_low_comms = NULL; + module->cached_up_comms = NULL; + module->cached_vranks = NULL; + module->cached_topo = NULL; + module->is_mapbycore = false; + module->storage_initialized = false; + for( i = 0; i < NB_TOPO_LVL; i++ ) { + module->sub_comm[i] = NULL; + } + for( i = SELF; i < COMPONENTS_COUNT; i++ ) { + module->modules_storage.modules[i].module_handler = NULL; + } + + module->dynamic_errors = 0; + + han_module_clear(module); +} + + +#define OBJ_RELEASE_IF_NOT_NULL(obj) \ + do { \ + if (NULL != (obj)) { \ + 
OBJ_RELEASE(obj); \ + } \ + } while (0) + +/* + * Module destructor + */ +static void +mca_coll_han_module_destruct(mca_coll_han_module_t * module) +{ + int i; + + module->enabled = false; + if (module->cached_low_comms != NULL) { + for (i = 0; i < COLL_HAN_LOW_MODULES; i++) { + ompi_comm_free(&(module->cached_low_comms[i])); + module->cached_low_comms[i] = NULL; + } + free(module->cached_low_comms); + module->cached_low_comms = NULL; + } + if (module->cached_up_comms != NULL) { + for (i = 0; i < COLL_HAN_UP_MODULES; i++) { + ompi_comm_free(&(module->cached_up_comms[i])); + module->cached_up_comms[i] = NULL; + } + free(module->cached_up_comms); + module->cached_up_comms = NULL; + } + if (module->cached_vranks != NULL) { + free(module->cached_vranks); + module->cached_vranks = NULL; + } + if (module->cached_topo != NULL) { + free(module->cached_topo); + module->cached_topo = NULL; + } + for(i=0 ; isub_comm[i]) { + ompi_comm_free(&(module->sub_comm[i])); + } + } + + OBJ_RELEASE_IF_NOT_NULL(module->previous_allgather_module); + OBJ_RELEASE_IF_NOT_NULL(module->previous_allreduce_module); + OBJ_RELEASE_IF_NOT_NULL(module->previous_bcast_module); + OBJ_RELEASE_IF_NOT_NULL(module->previous_gather_module); + OBJ_RELEASE_IF_NOT_NULL(module->previous_reduce_module); + OBJ_RELEASE_IF_NOT_NULL(module->previous_scatter_module); + + han_module_clear(module); +} + +OBJ_CLASS_INSTANCE(mca_coll_han_module_t, + mca_coll_base_module_t, + mca_coll_han_module_construct, + mca_coll_han_module_destruct); + +/* + * Initial query function that is invoked during MPI_INIT, allowing + * this component to disqualify itself if it doesn't support the + * required level of thread support. This function is invoked exactly + * once. + */ +int mca_coll_han_init_query(bool enable_progress_threads, + bool enable_mpi_threads) +{ + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:han:init_query: pick me! 
pick me!"); + return OMPI_SUCCESS; +} + + +/* + * Invoked when there's a new communicator that has been created. + * Look at the communicator and decide which set of functions and + * priority we want to return. + */ +mca_coll_base_module_t * +mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority) +{ + int flag; + char info_val[OPAL_MAX_INFO_VAL+1]; + mca_coll_han_module_t *han_module; + + /* + * If we're intercomm, or if there's only one process in the communicator + */ + if (OMPI_COMM_IS_INTER(comm)) { + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:han:comm_query (%d/%s): intercomm; disqualifying myself", + comm->c_contextid, comm->c_name); + return NULL; + } + if (1 == ompi_comm_size(comm)) { + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:han:comm_query (%d/%s): comm is too small; disqualifying myself", + comm->c_contextid, comm->c_name); + return NULL; + } + if( !ompi_group_have_remote_peers(comm->c_local_group) ) { + /* The group only contains local processes. Disable HAN for now */ + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:han:comm_query (%d/%s): comm has only local processes; disqualifying myself", + comm->c_contextid, comm->c_name); + return NULL; + } + /* Get the priority level attached to this module. If priority is less + * than or equal to 0, then the module is unavailable. 
*/ + *priority = mca_coll_han_component.han_priority; + if (mca_coll_han_component.han_priority <= 0) { + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:han:comm_query (%d/%s): priority too low; disqualifying myself", + comm->c_contextid, comm->c_name); + return NULL; + } + + han_module = OBJ_NEW(mca_coll_han_module_t); + if (NULL == han_module) { + return NULL; + } + + /* All is good -- return a module */ + han_module->topologic_level = GLOBAL_COMMUNICATOR; + + if (NULL != comm->super.s_info) { + /* Get the info value disaqualifying coll components */ + opal_info_get(comm->super.s_info, "ompi_comm_coll_han_topo_level", + sizeof(info_val), info_val, &flag); + + if (flag) { + if (0 == strcmp(info_val, "INTER_NODE")) { + han_module->topologic_level = INTER_NODE; + } else { + han_module->topologic_level = INTRA_NODE; + } + } + } + + han_module->super.coll_module_enable = han_module_enable; + han_module->super.ft_event = NULL; + han_module->super.coll_alltoall = NULL; + han_module->super.coll_alltoallv = NULL; + han_module->super.coll_alltoallw = NULL; + han_module->super.coll_barrier = NULL; + han_module->super.coll_exscan = NULL; + han_module->super.coll_gatherv = NULL; + han_module->super.coll_reduce_scatter = NULL; + han_module->super.coll_scan = NULL; + han_module->super.coll_scatterv = NULL; + han_module->super.coll_scatter = mca_coll_han_scatter_intra_dynamic; + han_module->super.coll_reduce = mca_coll_han_reduce_intra_dynamic; + han_module->super.coll_gather = mca_coll_han_gather_intra_dynamic; + han_module->super.coll_bcast = mca_coll_han_bcast_intra_dynamic; + han_module->super.coll_allreduce = mca_coll_han_allreduce_intra_dynamic; + han_module->super.coll_allgather = mca_coll_han_allgather_intra_dynamic; + + if (GLOBAL_COMMUNICATOR == han_module->topologic_level) { + /* We are on the global communicator, return topological algorithms */ + han_module->super.coll_allgatherv = NULL; + } else { + /* We are on a topologic 
sub-communicator, return only the selector */ + han_module->super.coll_allgatherv = mca_coll_han_allgatherv_intra_dynamic; + } + + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:han:comm_query (%d/%s): pick me! pick me!", + comm->c_contextid, comm->c_name); + return &(han_module->super); +} + + +/* + * In this macro, the following variables are supposed to have been declared + * in the caller: + * . ompi_communicator_t *comm + * . mca_coll_han_module_t *han_module + */ +#define HAN_SAVE_PREV_COLL_API(__api) \ + do { \ + if (!comm->c_coll->coll_ ## __api || !comm->c_coll->coll_ ## __api ## _module) { \ + opal_output_verbose(1, ompi_coll_base_framework.framework_output, \ + "(%d/%s): no underlying " # __api"; disqualifying myself", \ + comm->c_contextid, comm->c_name); \ + goto handle_error; \ + } \ + han_module->previous_ ## __api = comm->c_coll->coll_ ## __api; \ + han_module->previous_ ## __api ## _module = comm->c_coll->coll_ ## __api ## _module; \ + OBJ_RETAIN(han_module->previous_ ## __api ## _module); \ + } while(0) + +/* + * Init module on the communicator + */ +static int +han_module_enable(mca_coll_base_module_t * module, + struct ompi_communicator_t *comm) +{ + mca_coll_han_module_t * han_module = (mca_coll_han_module_t*) module; + + HAN_SAVE_PREV_COLL_API(allgather); + HAN_SAVE_PREV_COLL_API(allgatherv); + HAN_SAVE_PREV_COLL_API(allreduce); + HAN_SAVE_PREV_COLL_API(bcast); + HAN_SAVE_PREV_COLL_API(gather); + HAN_SAVE_PREV_COLL_API(reduce); + HAN_SAVE_PREV_COLL_API(scatter); + + /* set reproducible algos */ + mca_coll_han_reduce_reproducible_decision(comm, module); + mca_coll_han_allreduce_reproducible_decision(comm, module); + + return OMPI_SUCCESS; + +handle_error: + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allgather_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allgatherv_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allreduce_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_bcast_module); + 
OBJ_RELEASE_IF_NOT_NULL(han_module->previous_gather_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_reduce_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_scatter_module); + + return OMPI_ERROR; +} + +/* + * Module disable + */ +static int +mca_coll_han_module_disable(mca_coll_base_module_t * module, + struct ompi_communicator_t *comm) +{ + mca_coll_han_module_t * han_module = (mca_coll_han_module_t *) module; + + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allgather_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allgatherv_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allreduce_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_bcast_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_gather_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_reduce_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_scatter_module); + + han_module_clear(han_module); + + return OMPI_SUCCESS; +} + + +/* + * Free the han request + */ +int han_request_free(ompi_request_t ** request) +{ + (*request)->req_state = OMPI_REQUEST_INVALID; + OBJ_RELEASE(*request); + *request = MPI_REQUEST_NULL; + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/han/coll_han_reduce.c b/ompi/mca/coll/han/coll_han_reduce.c new file mode 100644 index 0000000000..03968b6f47 --- /dev/null +++ b/ompi/mca/coll/han/coll_han_reduce.c @@ -0,0 +1,444 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "coll_han.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/mca/pml/pml.h" +#include "coll_han_trigger.h" + +static int mca_coll_han_reduce_t0_task(void *task_args); +static int mca_coll_han_reduce_t1_task(void *task_args); + +static inline void +mca_coll_han_set_reduce_args(mca_coll_han_reduce_args_t * args, mca_coll_task_t * cur_task, void *sbuf, void *rbuf, + int seg_count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, + int root_up_rank, int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int num_segments, int cur_seg, int w_rank, int last_seg_count, + bool noop, bool is_tmp_rbuf) +{ + args->cur_task = cur_task; + args->sbuf = sbuf; + args->rbuf = rbuf; + args->seg_count = seg_count; + args->dtype = dtype; + args->op = op; + args->root_low_rank = root_low_rank; + args->root_up_rank = root_up_rank; + args->up_comm = up_comm; + args->low_comm = low_comm; + args->num_segments = num_segments; + args->cur_seg = cur_seg; + args->w_rank = w_rank; + args->last_seg_count = last_seg_count; + args->noop = noop; + args->is_tmp_rbuf = is_tmp_rbuf; +} + +/* + * Each segment of the messsage needs to go though 2 steps to perform MPI_Reduce: + * lb: low level (shared-memory or intra-node) reduce. +* ub: upper level (inter-node) reduce + * Hence, in each iteration, there is a combination of collective operations which is called a task. 
+ * | seg 0 | seg 1 | seg 2 | seg 3 | + * iter 0 | lr | | | | task: t0, contains lr + * iter 1 | ur | lr | | | task: t1, contains ur and lr + * iter 2 | | ur | lr | | task: t1, contains ur and lr + * iter 3 | | | ur | lr | task: t1, contains ur and lr + * iter 4 | | | | ur | task: t1, contains ur + */ +int +mca_coll_han_reduce_intra(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + ompi_op_t* op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module) +{ + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; + ptrdiff_t extent, lb; + int seg_count = count, w_rank; + size_t dtype_size; + + /* No support for non-commutative operations */ + if(!ompi_op_is_commute(op)) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle reduce with this operation. Fall back on another component\n")); + goto prev_reduce_intra; + } + + /* Create the subcommunicators */ + if( OMPI_SUCCESS != mca_coll_han_comm_create(comm, han_module) ) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle reduce with this communicator. Drop HAN support in this communicator and fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all modules */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_reduce(sbuf, rbuf, count, dtype, op, root, + comm, comm->c_coll->coll_reduce_module); + } + + /* Topo must be initialized to know rank distribution which then is used to + * determine if han can be used */ + mca_coll_han_topo_init(comm, han_module, 2); + if (han_module->are_ppn_imbalanced) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle reduce with this communicator (imbalanced). Drop HAN support in this communicator and fall back on another component\n")); + /* Put back the fallback collective support and call it once. 
All + * future calls will then be automatically redirected. + */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, reduce); + return comm->c_coll->coll_reduce(sbuf, rbuf, count, dtype, op, root, + comm, comm->c_coll->coll_reduce_module); + } + + ompi_datatype_get_extent(dtype, &lb, &extent); + w_rank = ompi_comm_rank(comm); + ompi_datatype_type_size(dtype, &dtype_size); + + ompi_communicator_t *low_comm; + ompi_communicator_t *up_comm; + + /* use MCA parameters for now */ + low_comm = han_module->cached_low_comms[mca_coll_han_component.han_reduce_low_module]; + up_comm = han_module->cached_up_comms[mca_coll_han_component.han_reduce_up_module]; + COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_reduce_segsize, dtype_size, + seg_count); + + int num_segments = (count + seg_count - 1) / seg_count; + OPAL_OUTPUT_VERBOSE((20, mca_coll_han_component.han_output, + "In HAN seg_count %d count %d num_seg %d\n", + seg_count, count, num_segments)); + + int *vranks = han_module->cached_vranks; + int low_rank = ompi_comm_rank(low_comm); + int low_size = ompi_comm_size(low_comm); + int up_rank = ompi_comm_rank(up_comm); + + int root_low_rank; + int root_up_rank; + mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: root_low_rank %d root_up_rank %d\n", w_rank, root_low_rank, + root_up_rank)); + + void *tmp_rbuf = rbuf; + void *tmp_rbuf_to_free = NULL; + if (low_rank == root_low_rank && root_up_rank != up_rank) { + /* allocate 2 segments on node leaders that are not the global root */ + tmp_rbuf = malloc(2*extent*seg_count); + tmp_rbuf_to_free = tmp_rbuf; + } + + /* Create t0 tasks for the first segment */ + mca_coll_task_t *t0 = OBJ_NEW(mca_coll_task_t); + /* Setup up t0 task arguments */ + mca_coll_han_reduce_args_t *t = malloc(sizeof(mca_coll_han_reduce_args_t)); + mca_coll_han_set_reduce_args(t, t0, (char *) sbuf, (char *) tmp_rbuf, seg_count, dtype, + op, root_up_rank, 
root_low_rank, up_comm, low_comm, + num_segments, 0, w_rank, count - (num_segments - 1) * seg_count, + low_rank != root_low_rank, (NULL != tmp_rbuf_to_free)); + /* Init the first task */ + init_task(t0, mca_coll_han_reduce_t0_task, (void *) t); + issue_task(t0); + + /* Create t1 task */ + mca_coll_task_t *t1 = OBJ_NEW(mca_coll_task_t); + /* Setup up t1 task arguments */ + t->cur_task = t1; + /* Init the t1 task */ + init_task(t1, mca_coll_han_reduce_t1_task, (void *) t); + issue_task(t1); + + while (t->cur_seg <= t->num_segments - 2) { + /* Create t1 task */ + mca_coll_task_t *t1 = OBJ_NEW(mca_coll_task_t); + /* Setup up t1 task arguments */ + t->cur_task = t1; + t->sbuf = (char *) t->sbuf + extent * t->seg_count; + if (up_rank == root_up_rank) { + t->rbuf = (char *) t->rbuf + extent * t->seg_count; + } + t->cur_seg = t->cur_seg + 1; + /* Init the t1 task */ + init_task(t1, mca_coll_han_reduce_t1_task, (void *) t); + issue_task(t1); + } + + free(t); + free(tmp_rbuf_to_free); + + return OMPI_SUCCESS; + + prev_reduce_intra: + return han_module->previous_reduce(sbuf, rbuf, count, dtype, op, root, + comm, + han_module->previous_reduce_module); +} + +/* t0 task: issue and wait for the low level reduce of segment 0 */ +int mca_coll_han_reduce_t0_task(void *task_args) +{ + mca_coll_han_reduce_args_t *t = (mca_coll_han_reduce_args_t *) task_args; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t0 %d\n", t->w_rank, + t->cur_seg)); + OBJ_RELEASE(t->cur_task); + ptrdiff_t extent, lb; + ompi_datatype_get_extent(t->dtype, &lb, &extent); + t->low_comm->c_coll->coll_reduce((char *) t->sbuf, (char *) t->rbuf, t->seg_count, t->dtype, + t->op, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_reduce_module); + return OMPI_SUCCESS; +} + +/* t1 task */ +int mca_coll_han_reduce_t1_task(void *task_args) { + mca_coll_han_reduce_args_t *t = (mca_coll_han_reduce_args_t *) task_args; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t1 
%d\n", t->w_rank, + t->cur_seg)); + OBJ_RELEASE(t->cur_task); + ptrdiff_t extent, lb; + int cur_seg = t->cur_seg; + ompi_datatype_get_extent(t->dtype, &lb, &extent); + ompi_request_t *ireduce_req = NULL; + if (!t->noop) { + int tmp_count = t->seg_count; + if (cur_seg == t->num_segments - 1 && t->last_seg_count != t->seg_count) { + tmp_count = t->last_seg_count; + } + int up_rank = ompi_comm_rank(t->up_comm); + /* ur of cur_seg */ + if (up_rank == t->root_up_rank) { + t->up_comm->c_coll->coll_ireduce(MPI_IN_PLACE, (char *) t->rbuf, tmp_count, t->dtype, + t->op, t->root_up_rank, t->up_comm, &ireduce_req, + t->up_comm->c_coll->coll_ireduce_module); + } else { + /* this is a node leader that is not root so alternate between the two allocated segments */ + char *tmp_sbuf = (char*)t->rbuf + (cur_seg % 2)*(extent * t->seg_count); + t->up_comm->c_coll->coll_ireduce(tmp_sbuf, NULL, tmp_count, + t->dtype, t->op, t->root_up_rank, t->up_comm, + &ireduce_req, t->up_comm->c_coll->coll_ireduce_module); + } + } + /* lr of cur_seg+1 */ + int next_seg = cur_seg + 1; + if (next_seg <= t->num_segments - 1) { + int tmp_count = t->seg_count; + char *tmp_rbuf = NULL; + if (next_seg == t->num_segments - 1 && t->last_seg_count != t->seg_count) { + tmp_count = t->last_seg_count; + } + if (t->is_tmp_rbuf) { + tmp_rbuf = (char*)t->rbuf + (next_seg % 2)*(extent * t->seg_count); + } else if (NULL != t->rbuf) { + tmp_rbuf = (char*)t->rbuf + extent * t->seg_count; + } + t->low_comm->c_coll->coll_reduce((char *) t->sbuf + extent * t->seg_count, + (char *) tmp_rbuf, tmp_count, + t->dtype, t->op, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_reduce_module); + + } + if (!t->noop && ireduce_req) { + ompi_request_wait(&ireduce_req, MPI_STATUS_IGNORE); + } + + return OMPI_SUCCESS; +} + +/* In case of non regular situation (imbalanced number of processes per nodes), + * a fallback is made on the next component that provides a reduce in priority order */ +int 
+mca_coll_han_reduce_intra_simple(const void *sbuf, + void* rbuf, + int count, + struct ompi_datatype_t *dtype, + ompi_op_t *op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + int w_rank; /* information about the global communicator */ + int root_low_rank, root_up_rank; /* root ranks for both sub-communicators */ + int ret; + int *vranks, low_rank, low_size; + ptrdiff_t rsize, rgap = 0; + void * tmp_buf; + + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + + /* No support for non-commutative operations */ + if(!ompi_op_is_commute(op)){ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle reduce with this operation. Fall back on another component\n")); + goto prev_reduce_intra; + } + + /* Create the subcommunicators */ + if( OMPI_SUCCESS != mca_coll_han_comm_create(comm, han_module) ) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle reduce with this communicator. Drop HAN support in this communicator and fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_reduce(sbuf, rbuf, count, dtype, op, root, + comm, comm->c_coll->coll_reduce_module); + } + + /* Topo must be initialized to know rank distribution which then is used to + * determine if han can be used */ + mca_coll_han_topo_init(comm, han_module, 2); + if (han_module->are_ppn_imbalanced) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle reduce with this communicator (imbalanced). Drop HAN support in this communicator and fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. 
+ */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, reduce); + return comm->c_coll->coll_reduce(sbuf, rbuf, count, dtype, op, root, + comm, comm->c_coll->coll_reduce_module); + } + + ompi_communicator_t *low_comm = + han_module->cached_low_comms[mca_coll_han_component.han_reduce_low_module]; + ompi_communicator_t *up_comm = + han_module->cached_up_comms[mca_coll_han_component.han_reduce_up_module]; + + /* Get the 'virtual ranks' mapping corresponding to the communicators */ + vranks = han_module->cached_vranks; + w_rank = ompi_comm_rank(comm); + low_rank = ompi_comm_rank(low_comm); + + low_size = ompi_comm_size(low_comm); + /* Get root ranks for low and up comms */ + mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); + + if (root_low_rank == low_rank && w_rank != root) { + rsize = opal_datatype_span(&dtype->super, (int64_t)count, &rgap); + tmp_buf = malloc(rsize); + if (NULL == tmp_buf) { + return OMPI_ERROR; + } + } else { + /* global root rbuf is valid, local non-root do not need buffers */ + tmp_buf = rbuf; + } + /* No need to handle MPI_IN_PLACE: only the global root may ask for it and + * it is ok to use it for intermediary reduces since it is also a local root*/ + + /* Low_comm reduce */ + ret = low_comm->c_coll->coll_reduce((char *)sbuf, (char *)tmp_buf, + count, dtype, op, root_low_rank, + low_comm, low_comm->c_coll->coll_reduce_module); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)){ + if (root_low_rank == low_rank && w_rank != root){ + free(tmp_buf); + } + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/REDUCE: low comm reduce failed. 
" + "Falling back to another component\n")); + goto prev_reduce_intra; + } + + /* Up_comm reduce */ + if (root_low_rank == low_rank ){ + if(w_rank != root){ + ret = up_comm->c_coll->coll_reduce((char *)tmp_buf, NULL, + count, dtype, op, root_up_rank, + up_comm, up_comm->c_coll->coll_reduce_module); + free(tmp_buf); + } else { + /* Take advantage of any optimisation made for IN_PLACE + * communcations */ + ret = up_comm->c_coll->coll_reduce(MPI_IN_PLACE, (char *)tmp_buf, + count, dtype, op, root_up_rank, + up_comm, up_comm->c_coll->coll_reduce_module); + } + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)){ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/REDUCE: low comm reduce failed.\n")); + return ret; + } + + } + return OMPI_SUCCESS; + + prev_reduce_intra: + return han_module->previous_reduce(sbuf, rbuf, count, dtype, op, root, + comm, han_module->previous_reduce_module); +} + + +/* Find a fallback on reproducible algorithm + * use tuned or basic or if impossible whatever available + */ +int +mca_coll_han_reduce_reproducible_decision(struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + int w_rank = ompi_comm_rank(comm); + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + + /* populate previous modules_storage*/ + mca_coll_han_get_all_coll_modules(comm, han_module); + + /* try availability of reproducible modules */ + int fallbacks[] = {TUNED, BASIC}; + int fallbacks_len = sizeof(fallbacks) / sizeof(*fallbacks); + int i; + for (i=0; imodules_storage.modules[fallback].module_handler; + if (fallback_module != NULL && fallback_module->coll_reduce != NULL) { + if (0 == w_rank) { + opal_output_verbose(30, mca_coll_han_component.han_output, + "coll:han:reduce_reproducible: " + "fallback on %s\n", + available_components[fallback].component_name); + } + han_module->reproducible_reduce_module = fallback_module; + han_module->reproducible_reduce = fallback_module->coll_reduce; + return OMPI_SUCCESS; + } + } + /* fallback 
of the fallback */ + if (0 == w_rank) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:reduce_reproducible_decision: " + "no reproducible fallback\n"); + } + han_module->reproducible_reduce_module = + han_module->previous_reduce_module; + han_module->reproducible_reduce = han_module->previous_reduce; + return OMPI_SUCCESS; +} + + +/* Fallback on reproducible algorithm */ +int +mca_coll_han_reduce_reproducible(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + return han_module->reproducible_reduce(sbuf, rbuf, count, dtype, + op, root, comm, + han_module + ->reproducible_reduce_module); +} diff --git a/ompi/mca/coll/han/coll_han_scatter.c b/ompi/mca/coll/han/coll_han_scatter.c new file mode 100644 index 0000000000..c52cc1911a --- /dev/null +++ b/ompi/mca/coll/han/coll_han_scatter.c @@ -0,0 +1,258 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "coll_han.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/mca/coll/base/coll_tags.h" +#include "ompi/mca/pml/pml.h" +#include "coll_han_trigger.h" + +static int mca_coll_han_scatter_us_task(void *task_args); +static int mca_coll_han_scatter_ls_task(void *task_args); + +/* Only work with regular situation (each node has equal number of processes) */ + +static inline void +mca_coll_han_set_scatter_args(mca_coll_han_scatter_args_t * args, + mca_coll_task_t * cur_task, + void *sbuf, + void *sbuf_inter_free, + void *sbuf_reorder_free, + int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, + int rcount, + struct ompi_datatype_t *rdtype, + int root, + int root_up_rank, + int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int w_rank, bool noop, ompi_request_t * req) +{ + args->cur_task = cur_task; + args->sbuf = sbuf; + args->sbuf_inter_free = sbuf_inter_free; + args->sbuf_reorder_free = sbuf_reorder_free; + args->scount = scount; + args->sdtype = sdtype; + args->rbuf = rbuf; + args->rcount = rcount; + args->rdtype = rdtype; + args->root = root; + args->root_up_rank = root_up_rank; + args->root_low_rank = root_low_rank; + args->up_comm = up_comm; + args->low_comm = low_comm; + args->w_rank = w_rank; + args->noop = noop; + args->req = req; +} + +int +mca_coll_han_scatter_intra(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module) +{ + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; + int i, j, w_rank, w_size; + w_rank = ompi_comm_rank(comm); + w_size = ompi_comm_size(comm); + + /* Create the subcommunicators */ + if( OMPI_SUCCESS != mca_coll_han_comm_create(comm, han_module) ) { /* Let's hope the error is consistently returned across 
the entire communicator */ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle scatter with this communicator. Fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_scatter(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, + comm, comm->c_coll->coll_scatter_module); + } + + /* Topo must be initialized to know rank distribution which then is used to + * determine if han can be used */ + int* topo = mca_coll_han_topo_init(comm, han_module, 2); + if (han_module->are_ppn_imbalanced) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle scatter with this communicator (imbalance). Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. + */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, scatter); + return comm->c_coll->coll_scatter(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, + comm, comm->c_coll->coll_scatter_module); + } + + ompi_communicator_t *low_comm = + han_module->cached_low_comms[mca_coll_han_component.han_scatter_low_module]; + ompi_communicator_t *up_comm = + han_module->cached_up_comms[mca_coll_han_component.han_scatter_up_module]; + int *vranks = han_module->cached_vranks; + int low_rank = ompi_comm_rank(low_comm); + int low_size = ompi_comm_size(low_comm); + int up_size = ompi_comm_size(up_comm); + + /* Set up request */ + ompi_request_t *temp_request = OBJ_NEW(ompi_request_t); + temp_request->req_state = OMPI_REQUEST_ACTIVE; + temp_request->req_type = OMPI_REQUEST_COLL; + temp_request->req_free = han_request_free; + temp_request->req_status = (ompi_status_public_t){0}; + temp_request->req_complete = REQUEST_PENDING; + + int root_low_rank; + int root_up_rank; + + mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); + 
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: Han Scatter root %d root_low_rank %d root_up_rank %d\n", w_rank, + root, root_low_rank, root_up_rank)); + + /* Reorder sbuf based on rank. + * Suppose, message is 0 1 2 3 4 5 6 7 + * and the processes are mapped on 2 nodes (the processes on the node 0 is 0 2 4 6 and the processes on the node 1 is 1 3 5 7), + * so the message needs to be reordered to 0 2 4 6 1 3 5 7 + */ + char *reorder_buf = NULL; + char *reorder_sbuf = NULL; + + if (w_rank == root) { + /* If the processes are mapped-by core, no need to reorder */ + if (han_module->is_mapbycore) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: Han Scatter is_bycore: ", w_rank)); + reorder_sbuf = (char *) sbuf; + } else { + ptrdiff_t ssize, sgap = 0, sextent; + ompi_datatype_type_extent(sdtype, &sextent); + ssize = opal_datatype_span(&sdtype->super, (int64_t) scount * w_size, &sgap); + reorder_buf = (char *) malloc(ssize); + reorder_sbuf = reorder_buf - sgap; + for (i = 0; i < up_size; i++) { + for (j = 0; j < low_size; j++) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: Han Scatter copy from %d %d\n", w_rank, + (i * low_size + j) * 2 + 1, + topo[(i * low_size + j) * 2 + 1])); + ompi_datatype_copy_content_same_ddt(sdtype, (ptrdiff_t) scount, + reorder_sbuf + sextent * (i * low_size + + j) * + (ptrdiff_t) scount, + (char *) sbuf + + sextent * + (ptrdiff_t) topo[(i * low_size + j) * 2 + + 1] * (ptrdiff_t) scount); + } + } + } + } + + + void *dest_buf = rbuf; + int dest_count = rcount; + ompi_datatype_t *dest_dtype = rdtype; + if (MPI_IN_PLACE == rbuf) { + dest_buf = (void*)sbuf; + dest_count = scount; + dest_dtype = sdtype; + } + + /* Create us task */ + mca_coll_task_t *us = OBJ_NEW(mca_coll_task_t); + /* Setup us task arguments */ + mca_coll_han_scatter_args_t *us_args = malloc(sizeof(mca_coll_han_scatter_args_t)); + mca_coll_han_set_scatter_args(us_args, us, reorder_sbuf, NULL, reorder_buf, scount, 
sdtype, + (char *) dest_buf, dest_count, dest_dtype, root, root_up_rank, root_low_rank, + up_comm, low_comm, w_rank, low_rank != root_low_rank, + temp_request); + /* Init us task */ + init_task(us, mca_coll_han_scatter_us_task, (void *) (us_args)); + /* Issure us task */ + issue_task(us); + + ompi_request_wait(&temp_request, MPI_STATUS_IGNORE); + return OMPI_SUCCESS; + +} + +/* us: upper level (intra-node) scatter task */ +int mca_coll_han_scatter_us_task(void *task_args) +{ + mca_coll_han_scatter_args_t *t = (mca_coll_han_scatter_args_t *) task_args; + + if (t->noop) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] Han Scatter: us noop\n", + t->w_rank)); + } else { + size_t count; + ompi_datatype_t *dtype; + if (t->w_rank == t->root) { + dtype = t->sdtype; + count = t->scount; + } else { + dtype = t->rdtype; + count = t->rcount; + } + int low_size = ompi_comm_size(t->low_comm); + ptrdiff_t rsize, rgap = 0; + rsize = opal_datatype_span(&dtype->super, (int64_t) count * low_size, &rgap); + char *tmp_buf = (char *) malloc(rsize); + char *tmp_rbuf = tmp_buf - rgap; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] Han Scatter: us scatter\n", t->w_rank)); + /* Inter node scatter */ + t->up_comm->c_coll->coll_scatter((char *) t->sbuf, t->scount * low_size, t->sdtype, + tmp_rbuf, t->rcount * low_size, t->rdtype, t->root_up_rank, + t->up_comm, t->up_comm->c_coll->coll_scatter_module); + t->sbuf = tmp_rbuf; + t->sbuf_inter_free = tmp_buf; + } + + if (t->sbuf_reorder_free != NULL && t->root == t->w_rank) { + free(t->sbuf_reorder_free); + t->sbuf_reorder_free = NULL; + } + /* Create ls tasks for the current union segment */ + mca_coll_task_t *ls = t->cur_task; + /* Init ls task */ + init_task(ls, mca_coll_han_scatter_ls_task, (void *) t); + /* Issure ls task */ + issue_task(ls); + + return OMPI_SUCCESS; +} + +/* ls: lower level (shared memory or intra-node) scatter task */ +int mca_coll_han_scatter_ls_task(void *task_args) +{ + 
mca_coll_han_scatter_args_t *t = (mca_coll_han_scatter_args_t *) task_args; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] Han Scatter: ls\n", + t->w_rank)); + OBJ_RELEASE(t->cur_task); + + t->low_comm->c_coll->coll_scatter((char *) t->sbuf, t->scount, t->sdtype, (char *) t->rbuf, + t->rcount, t->rdtype, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_scatter_module); + + if (t->sbuf_inter_free != NULL && t->noop != true) { + free(t->sbuf_inter_free); + t->sbuf_inter_free = NULL; + } + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] Han Scatter: ls finish\n", + t->w_rank)); + ompi_request_t *temp_req = t->req; + free(t); + ompi_request_complete(temp_req, 1); + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/han/coll_han_subcomms.c b/ompi/mca/coll/han/coll_han_subcomms.c new file mode 100644 index 0000000000..bf5b4df523 --- /dev/null +++ b/ompi/mca/coll/han/coll_han_subcomms.c @@ -0,0 +1,333 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + * + * Warning: this is not for the faint of heart -- don't even bother + * reading this source code if you don't have a strong understanding + * of nested data structures and pointer math (remember that + * associativity and order of C operations is *critical* in terms of + * pointer math!). 
+ */ + +#include "ompi_config.h" + +#include "mpi.h" +#include "coll_han.h" +#include "coll_han_dynamic.h" + +#define HAN_SUBCOM_SAVE_COLLECTIVE(FALLBACKS, COMM, HANM, COLL) \ + do { \ + (FALLBACKS).COLL.COLL = (COMM)->c_coll->coll_ ## COLL; \ + (FALLBACKS).COLL.module = (COMM)->c_coll->coll_ ## COLL ## _module; \ + (COMM)->c_coll->coll_ ## COLL = (HANM)->fallback.COLL.COLL; \ + (COMM)->c_coll->coll_ ## COLL ## _module = (HANM)->fallback.COLL.module; \ + } while(0) + +#define HAN_SUBCOM_LOAD_COLLECTIVE(FALLBACKS, COMM, HANM, COLL) \ + do { \ + (COMM)->c_coll->coll_ ## COLL = (FALLBACKS).COLL.COLL; \ + (COMM)->c_coll->coll_ ## COLL ## _module = (FALLBACKS).COLL.module; \ + } while(0) + +/* + * Routine that creates the local hierarchical sub-communicators + * Called each time a collective is called. + * comm: input communicator of the collective + */ +int mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module) +{ + int low_rank, low_size, up_rank, w_rank, w_size; + ompi_communicator_t **low_comm = &(han_module->sub_comm[INTRA_NODE]); + ompi_communicator_t **up_comm = &(han_module->sub_comm[INTER_NODE]); + mca_coll_han_collectives_fallback_t fallbacks; + int vrank, *vranks; + opal_info_t comm_info; + + /* The sub communicators have already been created */ + if (han_module->enabled && NULL != han_module->sub_comm[INTRA_NODE] + && NULL != han_module->sub_comm[INTER_NODE] + && NULL != han_module->cached_vranks) { + return OMPI_SUCCESS; + } + + /* + * We cannot use han allreduce and allgather without sub-communicators, + * but we are in the creation of the data structures for the HAN, and + * temporarily need to save back the old collective. 
+ * + * Allgather is used to compute vranks + * Allreduce is used by ompi_comm_split_type in create_intranode_comm_new + * Reduce + Bcast may be called by the allreduce implementation + * Gather + Bcast may be called by the allgather implementation + */ + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allgatherv); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allgather); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allreduce); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, bcast); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, reduce); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, gather); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, scatter); + + /** + * HAN is not yet optimized for a single process per node case, we should + * avoid selecting it for collective communication support in such cases. + * However, in order to decide if this is tru, we need to know how many + * local processes are on each node, a condition that cannot be verified + * outside the MPI support (with PRRTE the info will be eventually available, + * but we don't want to delay anything until then). We can achieve the same + * goal by using a reduction over the maximum number of peers per node among + * all participants. 
+ */ + int local_procs = ompi_group_count_local_peers(comm->c_local_group); + comm->c_coll->coll_allreduce(MPI_IN_PLACE, &local_procs, 1, MPI_INT, + MPI_MAX, comm, + comm->c_coll->coll_allreduce_module); + if( local_procs == 1 ) { + /* restore saved collectives */ + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgatherv); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allreduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, bcast); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, reduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, gather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, scatter); + han_module->enabled = false; /* entire module set to pass-through from now on */ + return OMPI_ERR_NOT_SUPPORTED; + } + + OBJ_CONSTRUCT(&comm_info, opal_info_t); + + /* Create topological sub-communicators */ + w_rank = ompi_comm_rank(comm); + w_size = ompi_comm_size(comm); + + /* + * This sub-communicator contains the ranks that share my node. + */ + opal_info_set(&comm_info, "ompi_comm_coll_preference", "han"); + opal_info_set(&comm_info, "ompi_comm_coll_han_topo_level", "INTRA_NODE"); + ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, + &comm_info, low_comm); + + /* + * Get my local rank and the local size + */ + low_size = ompi_comm_size(*low_comm); + low_rank = ompi_comm_rank(*low_comm); + + /* + * This sub-communicator contains one process per node: processes with the + * same intra-node rank id share such a sub-communicator + */ + opal_info_set(&comm_info, "ompi_comm_coll_han_topo_level", "INTER_NODE"); + ompi_comm_split_with_info(comm, low_rank, w_rank, &comm_info, up_comm, false); + + up_rank = ompi_comm_rank(*up_comm); + + /* + * Set my virtual rank number. 
my rank # = low_size * up_rank + low_rank + * +
+ * comm: input communicator of the collective + */ +int mca_coll_han_comm_create(struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module) +{ + int low_rank, low_size, up_rank, w_rank, w_size; + mca_coll_han_collectives_fallback_t fallbacks; + ompi_communicator_t **low_comms; + ompi_communicator_t **up_comms; + int vrank, *vranks; + opal_info_t comm_info; + + /* use cached communicators if possible */ + if (han_module->enabled && han_module->cached_low_comms != NULL && + han_module->cached_up_comms != NULL && + han_module->cached_vranks != NULL) { + return OMPI_SUCCESS; + } + + /* + * We cannot use han allreduce and allgather without sub-communicators, + * but we are in the creation of the data structures for the HAN, and + * temporarily need to save back the old collective. + * + * Allgather is used to compute vranks + * Allreduce is used by ompi_comm_split_type in create_intranode_comm_new + * Reduce + Bcast may be called by the allreduce implementation + * Gather + Bcast may be called by the allgather implementation + */ + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allgatherv); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allgather); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allreduce); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, bcast); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, reduce); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, gather); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, scatter); + + /** + * HAN is not yet optimized for a single process per node case, we should + * avoid selecting it for collective communication support in such cases. + * However, in order to decide if this is tru, we need to know how many + * local processes are on each node, a condition that cannot be verified + * outside the MPI support (with PRRTE the info will be eventually available, + * but we don't want to delay anything until then). 
We can achieve the same + * goal by using a reduction over the maximum number of peers per node among + * all participants. + */ + int local_procs = ompi_group_count_local_peers(comm->c_local_group); + comm->c_coll->coll_allreduce(MPI_IN_PLACE, &local_procs, 1, MPI_INT, + MPI_MAX, comm, + comm->c_coll->coll_allreduce_module); + if( local_procs == 1 ) { + /* restore saved collectives */ + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgatherv); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allreduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, bcast); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, reduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, gather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, scatter); + han_module->enabled = false; /* entire module set to pass-through from now on */ + return OMPI_ERR_NOT_SUPPORTED; + } + + /* create communicators if there is no cached communicator */ + w_rank = ompi_comm_rank(comm); + w_size = ompi_comm_size(comm); + low_comms = (struct ompi_communicator_t **)malloc(COLL_HAN_LOW_MODULES * + sizeof(struct ompi_communicator_t *)); + up_comms = (struct ompi_communicator_t **)malloc(COLL_HAN_UP_MODULES * + sizeof(struct ompi_communicator_t *)); + + OBJ_CONSTRUCT(&comm_info, opal_info_t); + + /* + * Upgrade sm module priority to set up low_comms[0] with sm module + * This sub-communicator contains the ranks that share my node. + */ + opal_info_set(&comm_info, "ompi_comm_coll_preference", "sm,^han"); + ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, + &comm_info, &(low_comms[0])); + + /* + * Get my local rank and the local size + */ + low_size = ompi_comm_size(low_comms[0]); + low_rank = ompi_comm_rank(low_comms[0]); + + /* + * Upgrade shared module priority to set up low_comms[1] with shared module + * This sub-communicator contains the ranks that share my node. 
+ */ + opal_info_set(&comm_info, "ompi_comm_coll_preference", "shared,^han"); + ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, + &comm_info, &(low_comms[1])); + + /* + * Upgrade libnbc module priority to set up up_comms[0] with libnbc module + * This sub-communicator contains one process per node: processes with the + * same intra-node rank id share such a sub-communicator + */ + opal_info_set(&comm_info, "ompi_comm_coll_preference", "libnbc,^han"); + ompi_comm_split_with_info(comm, low_rank, w_rank, &comm_info, &(up_comms[0]), false); + + up_rank = ompi_comm_rank(up_comms[0]); + + /* + * Upgrade adapt module priority to set up up_comms[0] with adapt module + * This sub-communicator contains one process per node. + */ + opal_info_set(&comm_info, "ompi_comm_coll_preference", "adapt,^han"); + ompi_comm_split_with_info(comm, low_rank, w_rank, &comm_info, &(up_comms[1]), false); + + /* + * Set my virtual rank number. + * my rank # = * + * + + * WARNING: this formula works only if the ranks are perfectly spread over + * the nodes + * TODO: find a better way of doing + */ + vrank = low_size * up_rank + low_rank; + vranks = (int *)malloc(sizeof(int) * w_size); + /* + * gather vrank from each process so every process will know other processes + * vrank + */ + comm->c_coll->coll_allgather(&vrank, 1, MPI_INT, vranks, 1, MPI_INT, comm, + comm->c_coll->coll_allgather_module); + + /* + * Set the cached info + */ + han_module->cached_low_comms = low_comms; + han_module->cached_up_comms = up_comms; + han_module->cached_vranks = vranks; + + /* Reset the saved collectives to point back to HAN */ + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgatherv); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allreduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, bcast); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, reduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, 
comm, han_module, gather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, scatter); + + OBJ_DESTRUCT(&comm_info); + return OMPI_SUCCESS; +} + + diff --git a/ompi/mca/coll/han/coll_han_topo.c b/ompi/mca/coll/han/coll_han_topo.c new file mode 100644 index 0000000000..e25e37207e --- /dev/null +++ b/ompi/mca/coll/han/coll_han_topo.c @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + * + * Warning: this is not for the faint of heart -- don't even bother + * reading this source code if you don't have a strong understanding + * of nested data structures and pointer math (remember that + * associativity and order of C operations is *critical* in terms of + * pointer math!). + */ + +#include "ompi_config.h" + +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ + +#ifdef HAVE_STDLIB_H +#include +#endif /* HAVE_STDLIB_H */ + + +#include "mpi.h" +#include "coll_han.h" + + +#if OPAL_ENABLE_DEBUG +static void +mca_coll_han_topo_print(int *topo, + struct ompi_communicator_t *comm, + int num_topo_level) +{ + int rank = ompi_comm_rank(comm); + int size = ompi_comm_size(comm); + + if (rank == 0) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: Han topo: ", rank)); + for( int i = 0; i < size*num_topo_level; i++ ) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "%d ", topo[i])); + } + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "\n")); + } +} +#endif /* OPAL_ENABLE_DEBUG */ + +/** + * Topology initialization phase + * Called each time a collective that needs buffer reordering is called + * + * @param num_topo_level (IN) Number of the topological levels + */ +int* +mca_coll_han_topo_init(struct ompi_communicator_t *comm, + mca_coll_han_module_t 
*han_module, + int num_topo_level) +{ + if ( NULL != han_module->cached_topo ) { + return han_module->cached_topo; + } + + ompi_communicator_t *up_comm, *low_comm; + ompi_request_t *request = MPI_REQUEST_NULL; + int *my_low_rank_map = NULL; + int *ranks_map = NULL; + + int size = ompi_comm_size(comm); + + if (NULL != han_module->cached_up_comms) { + up_comm = han_module->cached_up_comms[0]; + low_comm = han_module->cached_low_comms[0]; + } else { + up_comm = han_module->sub_comm[INTER_NODE]; + low_comm = han_module->sub_comm[INTRA_NODE]; + } + assert(up_comm != NULL && low_comm != NULL); + + int low_rank = ompi_comm_rank(low_comm); + int low_size = ompi_comm_size(low_comm); + + int *topo = (int *)malloc(sizeof(int) * size * num_topo_level); + int is_imbalanced = 1; + int ranks_consecutive = 1; + + /* node leaders translate the node-local ranks to global ranks and check whether they are placed consecutively */ + if (0 == low_rank) { + my_low_rank_map = malloc(sizeof(int)*low_size); + for (int i = 0; i < low_size; ++i) { + topo[i] = i; + } + ompi_group_translate_ranks(low_comm->c_local_group, low_size, topo, + comm->c_local_group, my_low_rank_map); + /* check if ranks are consecutive */ + int rank = my_low_rank_map[0] + 1; + for (int i = 1; i < low_size; ++i, ++rank) { + if (my_low_rank_map[i] != rank) { + ranks_consecutive = 0; + break; + } + } + + int reduce_vals[] = {ranks_consecutive, -ranks_consecutive, low_size, -low_size}; + + up_comm->c_coll->coll_allreduce(MPI_IN_PLACE, &reduce_vals, 4, + MPI_INT, MPI_MAX, up_comm, + up_comm->c_coll->coll_allreduce_module); + + /* is the distribution of processes balanced per node? */ + is_imbalanced = (reduce_vals[2] == -reduce_vals[3]) ? 0 : 1; + ranks_consecutive = (reduce_vals[0] == -reduce_vals[1]) ? 
1 : 0; + + if ( !ranks_consecutive && !is_imbalanced ) { + /* kick off up_comm allgather to collect non-consecutive rank information at node leaders */ + ranks_map = malloc(sizeof(int)*size); + up_comm->c_coll->coll_iallgather(my_low_rank_map, low_size, MPI_INT, + ranks_map, low_size, MPI_INT, up_comm, &request, + up_comm->c_coll->coll_iallgather_module); + } + } + + + /* broadcast balanced and consecutive properties from node leaders to remaining ranks */ + int bcast_vals[] = {is_imbalanced, ranks_consecutive}; + low_comm->c_coll->coll_bcast(bcast_vals, 2, MPI_INT, 0, + low_comm, low_comm->c_coll->coll_bcast_module); + is_imbalanced = bcast_vals[0]; + ranks_consecutive = bcast_vals[1]; + + /* error out if the rank distribution is not balanced */ + if (is_imbalanced) { + assert(MPI_REQUEST_NULL == request); + han_module->are_ppn_imbalanced = true; + free(topo); + if( NULL != my_low_rank_map ) free(my_low_rank_map); + if( NULL != ranks_map ) free(ranks_map); + return NULL; + } + + han_module->are_ppn_imbalanced = false; + + if (ranks_consecutive) { + /* fast-path: all ranks are consecutive and balanced so fill topology locally */ + for (int i = 0; i < size; ++i) { + topo[2*i] = (i/low_size); // node leader is node ID + topo[2*i+1] = i; + } + han_module->is_mapbycore = true; + } else { + /* + * Slow path: gather global-to-node-local rank mappings at node leaders + * + * The topology will contain a mapping from global consecutive positions + * to ranks in the communicator. 
+ * + * ex: 4 ranks executing on 2 nodes, mapped by node + * ranks 0 and 2 on hid0 + * ranks 1 and 3 on hid1 + * On entry the topo array looks like + * hid0 0 hid1 1 hid0 2 hid1 3 + * After the sort: + * hid0 0 hid0 2 hid1 1 hid1 3 + */ + if (0 == low_rank) { + ompi_request_wait(&request, MPI_STATUS_IGNORE); + /* fill topology */ + for (int i = 0; i < size; ++i) { + topo[2*i] = ranks_map[(i/low_size)*low_size]; // node leader is node ID + topo[2*i+1] = ranks_map[i]; + } + free(ranks_map); + } + } + + /* broadcast topology from node leaders to remaining ranks */ + low_comm->c_coll->coll_bcast(topo, num_topo_level*size, MPI_INT, 0, + low_comm, low_comm->c_coll->coll_bcast_module); + free(my_low_rank_map); + han_module->cached_topo = topo; +#if OPAL_ENABLE_DEBUG + mca_coll_han_topo_print(topo, comm, num_topo_level); +#endif /* OPAL_ENABLE_DEBUG */ + + return topo; +} + diff --git a/ompi/mca/coll/han/coll_han_trigger.c b/ompi/mca/coll/han/coll_han_trigger.c new file mode 100644 index 0000000000..87c8ed9597 --- /dev/null +++ b/ompi/mca/coll/han/coll_han_trigger.c @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "coll_han_trigger.h" + +static void mca_coll_task_constructor(mca_coll_task_t * t) +{ + t->func_ptr = NULL; + t->func_args = NULL; +} + +static void mca_coll_task_destructor(mca_coll_task_t * t) +{ + t->func_ptr = NULL; + t->func_args = NULL; +} + +OBJ_CLASS_INSTANCE(mca_coll_task_t, opal_object_t, mca_coll_task_constructor, + mca_coll_task_destructor); diff --git a/ompi/mca/coll/han/coll_han_trigger.h b/ompi/mca/coll/han/coll_han_trigger.h new file mode 100644 index 0000000000..413e393be6 --- /dev/null +++ b/ompi/mca/coll/han/coll_han_trigger.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_COLL_HAN_TRIGGER_EXPORT_H +#define MCA_COLL_HAN_TRIGGER_EXPORT_H + +#include "ompi/communicator/communicator.h" +#include "ompi/op/op.h" +#include "ompi/datatype/ompi_datatype.h" + + +typedef int (*task_func_ptr) (void *); + +struct mca_coll_task_s { + opal_object_t super; + task_func_ptr func_ptr; + void *func_args; +}; + +typedef struct mca_coll_task_s mca_coll_task_t; + +OBJ_CLASS_DECLARATION(mca_coll_task_t); + +/* Init task */ +static inline int +init_task(mca_coll_task_t * t, task_func_ptr func_ptr, void *func_args) +{ + OBJ_CONSTRUCT(t, mca_coll_task_t); + t->func_ptr = func_ptr; + t->func_args = func_args; + return OMPI_SUCCESS; +} + +/* Issue the task */ +static inline int +issue_task(mca_coll_task_t * t) +{ + return t->func_ptr(t->func_args); +} + +#endif /* MCA_COLL_HAN_TRIGGER_EXPORT_H */ diff --git a/ompi/mca/coll/sm/coll_sm_module.c b/ompi/mca/coll/sm/coll_sm_module.c index 781215251e..25e9c77946 100644 --- a/ompi/mca/coll/sm/coll_sm_module.c +++ b/ompi/mca/coll/sm/coll_sm_module.c @@ -176,7 +176,7 @@ mca_coll_sm_comm_query(struct ompi_communicator_t *comm, int *priority) 
if (OMPI_COMM_IS_INTER(comm) || 1 == ompi_comm_size(comm) || ompi_group_have_remote_peers (comm->c_local_group)) { opal_output_verbose(10, ompi_coll_base_framework.framework_output, "coll:sm:comm_query (%d/%s): intercomm, comm is too small, or not all peers local; disqualifying myself", comm->c_contextid, comm->c_name); - return NULL; + return NULL; } /* Get the priority level attached to this module. If priority is less diff --git a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c index cc73fcf835..637122185e 100644 --- a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c +++ b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c @@ -1446,7 +1446,8 @@ int ompi_coll_tuned_gather_intra_dec_fixed(const void *sbuf, int scount, communicator_size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); - if (rank == root) { + /* Determine block size */ + if ( (rank == root) || (MPI_IN_PLACE == sbuf) ) { ompi_datatype_type_size(rdtype, &dsize); total_dsize = dsize * (ptrdiff_t)rcount; } else { diff --git a/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c b/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c index 7ba85078fd..a259c789ac 100644 --- a/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c +++ b/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2015 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -29,19 +29,19 @@ /* need to include our own topo prototypes so we can malloc data on the comm correctly */ #include "ompi/mca/coll/base/coll_base_topo.h" +/* need file reading function */ +#include "ompi/mca/coll/base/coll_base_util.h" + /* also need the dynamic rule structures */ #include "coll_tuned_dynamic_rules.h" /* and our own prototypes */ #include "coll_tuned_dynamic_file.h" - -#define MYEOF -999 - -static long getnext (FILE *fptr); /* local function */ - static int fileline=0; /* used for verbose error messages */ +#define getnext(fptr, pval) ompi_coll_base_file_getnext_long(fptr, &fileline, pval) + /* * Reads a rule file called fname * Builds the algorithm rule table for a max of n_collectives @@ -56,9 +56,8 @@ static int fileline=0; /* used for verbose error messages */ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** rules, int n_collectives) { + long CI, NCS, CS, ALG, NMS, FANINOUT, X, MS, SS; FILE *fptr = (FILE*) NULL; - int X, CI, NCS, CS, ALG, NMS, FANINOUT; - long MS, SS; int x, ncs, nms; ompi_coll_alg_rule_t *alg_rules = (ompi_coll_alg_rule_t*) NULL; /* complete table of rules */ @@ -101,45 +100,42 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** goto on_file_error; } - X = (int)getnext(fptr); - if (X<0) { + if( (getnext(fptr, &X) < 0) || (X < 0) ) { OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of collectives in configuration file around line %d\n", fileline)); goto on_file_error; } if (X>n_collectives) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Number of collectives in configuration file %d is greater than number of MPI collectives possible %d ??? error around line %d\n", X, n_collectives, fileline)); + OPAL_OUTPUT((ompi_coll_tuned_stream,"Number of collectives in configuration file %ld is greater than number of MPI collectives possible %d ??? 
error around line %d\n", X, n_collectives, fileline)); goto on_file_error; } for (x=0;x=n_collectives) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Collective id in configuration file %d is greater than MPI collectives possible %d. Error around line %d\n", CI, n_collectives, fileline)); + OPAL_OUTPUT((ompi_coll_tuned_stream,"Collective id in configuration file %ld is greater than MPI collectives possible %d. Error around line %d\n", CI, n_collectives, fileline)); goto on_file_error; } if (alg_rules[CI].alg_rule_id != CI) { - OPAL_OUTPUT((ompi_coll_tuned_stream, "Internal error in handling collective ID %d\n", CI)); + OPAL_OUTPUT((ompi_coll_tuned_stream, "Internal error in handling collective ID %ld\n", CI)); goto on_file_error; } - OPAL_OUTPUT((ompi_coll_tuned_stream, "Reading dynamic rule for collective ID %d\n", CI)); + OPAL_OUTPUT((ompi_coll_tuned_stream, "Reading dynamic rule for collective ID %ld\n", CI)); alg_p = &alg_rules[CI]; alg_p->alg_rule_id = CI; alg_p->n_com_sizes = 0; alg_p->com_rules = (ompi_coll_com_rule_t *) NULL; - NCS = (int)getnext (fptr); - if (NCS<0) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read count of communicators for collective ID %d at around line %d\n", CI, fileline)); + if( (getnext (fptr, &NCS) < 0) || (NCS < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read count of communicators for collective ID %ld at around line %d\n", CI, fileline)); goto on_file_error; } - OPAL_OUTPUT((ompi_coll_tuned_stream, "Read communicator count %d for dynamic rule for collective ID %d\n", NCS, CI)); + OPAL_OUTPUT((ompi_coll_tuned_stream, "Read communicator count %ld for dynamic rule for collective ID %ld\n", NCS, CI)); alg_p->n_com_sizes = NCS; alg_p->com_rules = ompi_coll_tuned_mk_com_rules (NCS, CI); if (NULL == alg_p->com_rules) { @@ -151,20 +147,18 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** com_p = &(alg_p->com_rules[ncs]); - CS = (int)getnext (fptr); - if (CS<0) { - 
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read communicator size for collective ID %d com rule %d at around line %d\n", CI, ncs, fileline)); + if( (getnext (fptr, &CS) < 0) || (CS < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read communicator size for collective ID %ld com rule %d at around line %d\n", CI, ncs, fileline)); goto on_file_error; } com_p->mpi_comsize = CS; - NMS = (int)getnext (fptr); - if (NMS<0) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of message sizes for collective ID %d com rule %d at around line %d\n", CI, ncs, fileline)); + if( (getnext (fptr, &NMS) < 0) || (NMS < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of message sizes for collective ID %ld com rule %d at around line %d\n", CI, ncs, fileline)); goto on_file_error; } - OPAL_OUTPUT((ompi_coll_tuned_stream, "Read message count %d for dynamic rule for collective ID %d and comm size %d\n", + OPAL_OUTPUT((ompi_coll_tuned_stream, "Read message count %ld for dynamic rule for collective ID %ld and comm size %ld\n", NMS, CI, CS)); com_p->n_msg_sizes = NMS; com_p->msg_rules = ompi_coll_tuned_mk_msg_rules (NMS, CI, ncs, CS); @@ -179,37 +173,33 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** msg_p = &(com_p->msg_rules[nms]); - MS = getnext (fptr); - if (MS<0) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read message size for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); + if( (getnext (fptr, &MS) < 0) || (MS < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read message size for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); goto on_file_error; } msg_p->msg_size = (size_t)MS; - ALG = (int)getnext (fptr); - if (ALG<0) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target algorithm method for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); + if( (getnext (fptr, 
&ALG) < 0) || (ALG < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target algorithm method for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); goto on_file_error; } msg_p->result_alg = ALG; - FANINOUT = (int)getnext (fptr); - if (FANINOUT<0) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read fan in/out topo for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); + if( (getnext (fptr, &FANINOUT) < 0) || (FANINOUT < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read fan in/out topo for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); goto on_file_error; } msg_p->result_topo_faninout = FANINOUT; - SS = getnext (fptr); - if (SS<0) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target segment size for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); + if( (getnext (fptr, &SS) < 0) || (SS < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target segment size for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); goto on_file_error; } msg_p->result_segsize = SS; if (!nms && MS) { OPAL_OUTPUT((ompi_coll_tuned_stream,"All algorithms must specify a rule for message size of zero upwards always first!\n")); - OPAL_OUTPUT((ompi_coll_tuned_stream,"Message size was %lu for collective ID %d com rule %d msg rule %d at around line %d\n", MS, CI, ncs, nms, fileline)); + OPAL_OUTPUT((ompi_coll_tuned_stream,"Message size was %lu for collective ID %ld com rule %d msg rule %d at around line %d\n", MS, CI, ncs, nms, fileline)); goto on_file_error; } @@ -222,7 +212,7 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** } /* comm size */ total_alg_count++; - OPAL_OUTPUT((ompi_coll_tuned_stream, "Done reading dynamic rule for collective ID %d\n", CI)); + OPAL_OUTPUT((ompi_coll_tuned_stream, "Done reading dynamic rule 
for collective ID %ld\n", CI)); } /* per collective */ @@ -261,36 +251,3 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** return (-1); } - -static void skiptonewline (FILE *fptr) -{ - char val; - int rc; - - do { - rc = fread(&val, 1, 1, fptr); - if (0 == rc) return; - if ((1 == rc)&&('\n' == val)) { - fileline++; - return; - } - } while (1); -} - -static long getnext (FILE *fptr) -{ - long val; - int rc; - char trash; - - do { - rc = fscanf(fptr, "%li", &val); - if (rc == EOF) return MYEOF; - if (1 == rc) return val; - /* in all other cases, skip to the end */ - rc = fread(&trash, 1, 1, fptr); - if (rc == EOF) return MYEOF; - if ('\n' == trash) fileline++; - if ('#' == trash) skiptonewline (fptr); - } while (1); -} diff --git a/ompi/request/request.c b/ompi/request/request.c index a8ddb68ad3..abf33449d8 100644 --- a/ompi/request/request.c +++ b/ompi/request/request.c @@ -54,7 +54,7 @@ static void ompi_request_construct(ompi_request_t* req) /* don't call _INIT, we don't to set the request to _INACTIVE and there will * be no matching _FINI invocation */ req->req_state = OMPI_REQUEST_INVALID; - req->req_complete = false; + req->req_complete = REQUEST_COMPLETED; req->req_persistent = false; req->req_start = NULL; req->req_free = NULL; diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index efed62451a..6f9fdce277 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -380,7 +380,7 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, } complete_loop: assert( pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED ); - if( 0 != iov_len_local ) { + if( (pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA) && (0 != iov_len_local) ) { unsigned char* temp = conv_ptr; /* We have some partial data here. Let's copy it into the convertor * and keep it hot until the next round. 
@@ -391,7 +391,7 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, opal_unpack_partial_datatype( pConvertor, pElem, iov_ptr, 0, iov_len_local, &temp ); - + pConvertor->partial_length = iov_len_local; iov_len_local = 0; }