1
1

A complete overhaul of the HAN code.

Among many other things:
- Fix an imbalance bug in MPI_allgather
- Accept more human readable configuration files. We can now specify
  the collective by name instead of a magic number, and the component
  we want to use also by name.
- Add the capability to have optional arguments in the collective
  communication configuration file. Right now the capability exists
  for segment lengths, but is yet to be connected with the algorithms.
- Redo the initialization of all HAN collectives.

Cleanup the fallback collective support.
- In case the module is unable to deliver the expected result, it will fallback
  executing the collective operation on another collective component. This change
  makes the support for this fallback simpler to use.
- Implement a fallback allowing a HAN module to remove itself as
  potential active collective module, and instead fallback to the
  next module in line.
- Completely disable the HAN modules on error. From the moment an error is
  encountered they remove themselves from the communicator, and in case some
  other module calls them they simply behave as a pass-through.

Communicator: provide ompi_comm_split_with_info to split and provide info at the same time
Add ompi_comm_coll_preference info key to control collective component selection

COLL HAN: use info keys instead of component-level variable to communicate topology level between abstraction layers
- The info value is a comma-separated list of entries, which are chosen with
  decreasing priorities. This overrides the priority of the component,
  unless the component has disqualified itself.
  An entry prefixed with ^ starts the ignore-list. Any entry following this
  character will be ignored during the collective component selection for the
  communicator.
  Example: "sm,libnbc,^han,adapt" gives sm the highest preference, followed
  by libnbc. The components han and adapt are ignored in the selection process.
- Allocate a temporary buffer for all lower-level leaders (length 2 segments)
- Fix the handling of MPI_IN_PLACE for gather and scatter.

COLL HAN: Fix topology handling
 - HAN should not rely on node names to determine the ordering of ranks.
   Instead, use the node leaders as identifiers and short-cut if the
   node-leaders agree that ranks are consecutive. Also, error out if
   the rank distribution is imbalanced for now.

Signed-off-by: Xi Luo <xluo12@vols.utk.edu>
Signed-off-by: Joseph Schuchart <schuchart@icl.utk.edu>
Signed-off-by: George Bosilca <bosilca@icl.utk.edu>

Conflicts:
	ompi/mca/coll/adapt/coll_adapt_ibcast.c
Этот коммит содержится в:
George Bosilca 2020-05-14 00:07:50 -04:00
родитель 94c817ceff
Коммит 6d735ba052
31 изменённых файлов: 2668 добавлений и 2974 удалений

Просмотреть файл

@ -400,11 +400,10 @@ int ompi_comm_create ( ompi_communicator_t *comm, ompi_group_t *group,
/**********************************************************************/ /**********************************************************************/
/**********************************************************************/ /**********************************************************************/
/**********************************************************************/ /**********************************************************************/
/*
** Counterpart to MPI_Comm_split. To be used within OMPI (e.g. MPI_Cart_sub). int ompi_comm_split_with_info( ompi_communicator_t* comm, int color, int key,
*/ opal_info_t *info,
int ompi_comm_split( ompi_communicator_t* comm, int color, int key, ompi_communicator_t **newcomm, bool pass_on_topo )
ompi_communicator_t **newcomm, bool pass_on_topo )
{ {
int myinfo[2]; int myinfo[2];
int size, my_size; int size, my_size;
@ -610,7 +609,11 @@ int ompi_comm_split( ompi_communicator_t* comm, int color, int key,
snprintf(newcomp->c_name, MPI_MAX_OBJECT_NAME, "MPI COMMUNICATOR %d SPLIT FROM %d", snprintf(newcomp->c_name, MPI_MAX_OBJECT_NAME, "MPI COMMUNICATOR %d SPLIT FROM %d",
newcomp->c_contextid, comm->c_contextid ); newcomp->c_contextid, comm->c_contextid );
/* Copy info if there is one */
if (info) {
newcomp->super.s_info = OBJ_NEW(opal_info_t);
opal_info_dup(info, &(newcomp->super.s_info));
}
/* Activate the communicator and init coll-component */ /* Activate the communicator and init coll-component */
rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode); rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode);
@ -637,6 +640,15 @@ int ompi_comm_split( ompi_communicator_t* comm, int color, int key,
} }
/*
** Counterpart to MPI_Comm_split. To be used within OMPI (e.g. MPI_Cart_sub).
** Thin wrapper around ompi_comm_split_with_info: passes a NULL info so no
** info keys are attached to the newly created communicator.
*/
int ompi_comm_split( ompi_communicator_t* comm, int color, int key,
                     ompi_communicator_t **newcomm, bool pass_on_topo )
{
    return ompi_comm_split_with_info(comm, color, key, NULL, newcomm, pass_on_topo);
}
/**********************************************************************/ /**********************************************************************/
/**********************************************************************/ /**********************************************************************/
/**********************************************************************/ /**********************************************************************/

Просмотреть файл

@ -463,6 +463,21 @@ int ompi_topo_dist_graph_create_adjacent(ompi_communicator_t *old_comm,
OMPI_DECLSPEC int ompi_comm_split (ompi_communicator_t *comm, int color, int key, OMPI_DECLSPEC int ompi_comm_split (ompi_communicator_t *comm, int color, int key,
ompi_communicator_t** newcomm, bool pass_on_topo); ompi_communicator_t** newcomm, bool pass_on_topo);
/**
 * Split a communicator based on color and key, and attach the provided
 * info object to the resulting communicator. Parameters are otherwise
 * identical to \see ompi_comm_split.
 *
 * @param comm input communicator
 * @param color control of subset assignment
 * @param key control of rank ordering within the subset
 * @param info info object to duplicate onto the new communicator (may be NULL)
 * @param newcomm output: the new communicator
 * @param pass_on_topo whether to pass the topology on to the new communicator
 *
 * @return OMPI error code
 */
OMPI_DECLSPEC int ompi_comm_split_with_info( ompi_communicator_t* comm, int color, int key,
opal_info_t *info,
ompi_communicator_t **newcomm, bool pass_on_topo );
/** /**
* split a communicator based on type and key. Parameters * split a communicator based on type and key. Parameters
* are identical to the MPI-counterpart of the function. * are identical to the MPI-counterpart of the function.

Просмотреть файл

@ -578,3 +578,31 @@ bool ompi_group_have_remote_peers (ompi_group_t *group)
return false; return false;
} }
/**
 * Count the number of processes in this group that share the same node as
 * this process.
 *
 * NOTE(review): since the test is OPAL_PROC_ON_LOCAL_NODE on each group
 * member, the count presumably includes the calling process itself when it
 * belongs to the group — confirm against callers.
 *
 * @param group  group whose members are scanned
 * @return number of group members residing on the local node
 */
int ompi_group_count_local_peers (ompi_group_t *group)
{
    int local_peers = 0;
    for (int i = 0 ; i < group->grp_proc_count ; ++i) {
        ompi_proc_t *proc = NULL;
#if OMPI_GROUP_SPARSE
        /* sparse groups: resolve through the lookup path */
        proc = ompi_group_peer_lookup (group, i);
#else
        proc = ompi_group_get_proc_ptr_raw (group, i);
        if (ompi_proc_is_sentinel (proc)) {
            /* the proc must be stored in the group or cached in the proc
             * hash table if the process resides in the local node
             * (see ompi_proc_complete_init) */
            continue;
        }
#endif
        if (OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)) {
            local_peers++;
        }
    }
    return local_peers;
}

Просмотреть файл

@ -419,8 +419,16 @@ static inline struct ompi_proc_t *ompi_group_peer_lookup_existing (ompi_group_t
return ompi_group_get_proc_ptr (group, peer_id, false); return ompi_group_get_proc_ptr (group, peer_id, false);
} }
/**
* Return true if all processes in the group are not on the local node.
*/
bool ompi_group_have_remote_peers (ompi_group_t *group); bool ompi_group_have_remote_peers (ompi_group_t *group);
/**
* Count the number of processes on the local node.
*/
int ompi_group_count_local_peers (ompi_group_t *group);
/** /**
* Function to print the group info * Function to print the group info
*/ */

Просмотреть файл

@ -178,7 +178,7 @@ static int send_cb(ompi_request_t * req)
|| (context->con->tree->tree_nextsize > 0 && rank != context->con->root || (context->con->tree->tree_nextsize > 0 && rank != context->con->root
&& num_sent == context->con->tree->tree_nextsize * context->con->num_segs && num_sent == context->con->tree->tree_nextsize * context->con->num_segs
&& num_recv_fini == context->con->num_segs)) { && num_recv_fini == context->con->num_segs)) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Singal in send\n", OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Signal in send\n",
ompi_comm_rank(context->con->comm))); ompi_comm_rank(context->con->comm)));
ibcast_request_fini(context); ibcast_request_fini(context);
} }
@ -304,7 +304,7 @@ static int recv_cb(ompi_request_t * req)
&& num_sent == context->con->tree->tree_nextsize * context->con->num_segs && num_sent == context->con->tree->tree_nextsize * context->con->num_segs
&& num_recv_fini == context->con->num_segs) || (context->con->tree->tree_nextsize == 0 && num_recv_fini == context->con->num_segs) || (context->con->tree->tree_nextsize == 0
&& num_recv_fini == context->con->num_segs)) { && num_recv_fini == context->con->num_segs)) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Singal in recv\n", OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Signal in recv\n",
ompi_comm_rank(context->con->comm))); ompi_comm_rank(context->con->comm)));
ibcast_request_fini(context); ibcast_request_fini(context);
} }

Просмотреть файл

@ -38,6 +38,7 @@
#include "mpi.h" #include "mpi.h"
#include "ompi/communicator/communicator.h" #include "ompi/communicator/communicator.h"
#include "opal/util/output.h" #include "opal/util/output.h"
#include "opal/util/argv.h"
#include "opal/util/show_help.h" #include "opal/util/show_help.h"
#include "opal/class/opal_list.h" #include "opal/class/opal_list.h"
#include "opal/class/opal_object.h" #include "opal/class/opal_object.h"
@ -312,6 +313,20 @@ static int avail_coll_compare (opal_list_item_t **a,
return 0; return 0;
} }
/*
 * Check whether component_name appears in the NULL-terminated argv array.
 * A NULL argv matches nothing.
 *
 * @return 1 if the name is present, 0 otherwise.
 */
static inline int
component_in_argv(char **argv, const char* component_name)
{
    if( NULL == argv ) {
        return 0;
    }
    for( char **entry = argv; NULL != *entry; entry++ ) {
        if( 0 == strcmp(*entry, component_name) ) {
            return 1;
        }
    }
    return 0;
}
/* /*
* For each module in the list, check and see if it wants to run, and * For each module in the list, check and see if it wants to run, and
* do the resulting priority comparison. Make a list of modules to be * do the resulting priority comparison. Make a list of modules to be
@ -321,13 +336,66 @@ static int avail_coll_compare (opal_list_item_t **a,
static opal_list_t *check_components(opal_list_t * components, static opal_list_t *check_components(opal_list_t * components,
ompi_communicator_t * comm) ompi_communicator_t * comm)
{ {
int priority; int priority, flag;
const mca_base_component_t *component; const mca_base_component_t *component;
mca_base_component_list_item_t *cli; mca_base_component_list_item_t *cli;
mca_coll_base_module_2_3_0_t *module; mca_coll_base_module_2_3_0_t *module;
opal_list_t *selectable; opal_list_t *selectable;
mca_coll_base_avail_coll_t *avail; mca_coll_base_avail_coll_t *avail;
char info_val[OPAL_MAX_INFO_VAL+1];
char **coll_argv = NULL, **coll_exclude = NULL, **coll_include = NULL;
/* Check if this communicator comes with restrictions on the collective modules
* it wants to use. The restrictions are consistent with the MCA parameter
* to limit the collective components loaded, but it applies for each
* communicator and is provided as an info key during the communicator
* creation. Unlike the MCA param, this info key is used not to select
* components but either to prevent components from being used or to
* force a change in the component priority.
*/
if( NULL != comm->super.s_info) {
opal_info_get(comm->super.s_info, "ompi_comm_coll_preference",
sizeof(info_val), info_val, &flag);
if( !flag ) {
goto proceed_to_select;
}
coll_argv = opal_argv_split(info_val, ',');
if(NULL == coll_argv) {
goto proceed_to_select;
}
int idx2, count_include = opal_argv_count(coll_argv);
/* Allocate the coll_include argv */
coll_include = (char**)malloc((count_include + 1) * sizeof(char*));
coll_include[count_include] = NULL; /* NULL terminated array */
/* Dispatch the include/exclude in the corresponding arrays */
for( int idx = 0; NULL != coll_argv[idx]; idx++ ) {
if( '^' == coll_argv[idx][0] ) {
coll_include[idx] = NULL; /* NULL terminated array */
/* Allocate the coll_exclude argv */
coll_exclude = (char**)malloc((count_include - idx + 1) * sizeof(char*));
/* save the exclude components */
for( idx2 = idx; NULL != coll_argv[idx2]; idx2++ ) {
coll_exclude[idx2 - idx] = coll_argv[idx2];
}
coll_exclude[idx2 - idx] = NULL; /* NULL-terminated array */
coll_exclude[0] = coll_exclude[0] + 1; /* get rid of the ^ */
count_include = idx;
break;
}
coll_include[idx] = coll_argv[idx];
}
/* Reverse the order of the coll_include argv to facilitate the ordering of
 * the selected components in reverse.
 */
for( idx2 = 0; idx2 < (count_include - 1); idx2++ ) {
char* temp = coll_include[idx2];
coll_include[idx2] = coll_include[count_include - 1];
coll_include[count_include - 1] = temp;
count_include--;
}
}
proceed_to_select:
/* Make a list of the components that query successfully */ /* Make a list of the components that query successfully */
selectable = OBJ_NEW(opal_list_t); selectable = OBJ_NEW(opal_list_t);
@ -335,6 +403,13 @@ static opal_list_t *check_components(opal_list_t * components,
OPAL_LIST_FOREACH(cli, &ompi_coll_base_framework.framework_components, mca_base_component_list_item_t) { OPAL_LIST_FOREACH(cli, &ompi_coll_base_framework.framework_components, mca_base_component_list_item_t) {
component = cli->cli_component; component = cli->cli_component;
/* don't bother if we have this component in the exclusion list */
if( component_in_argv(coll_exclude, component->mca_component_name) ) {
opal_output_verbose(10, ompi_coll_base_framework.framework_output,
"coll:base:comm_select: component disqualified: %s (due to communicator info key)",
component->mca_component_name );
continue;
}
priority = check_one_component(comm, component, &module); priority = check_one_component(comm, component, &module);
if (priority >= 0) { if (priority >= 0) {
/* We have a component that indicated that it wants to run /* We have a component that indicated that it wants to run
@ -370,6 +445,27 @@ static opal_list_t *check_components(opal_list_t * components,
/* Put this list in priority order */ /* Put this list in priority order */
opal_list_sort(selectable, avail_coll_compare); opal_list_sort(selectable, avail_coll_compare);
/* For all valid component reorder them not on their provided priorities but on
* the order requested in the info key. As at this point the coll_include is
* already ordered backward we can simply prepend the components.
*/
mca_coll_base_avail_coll_t *item, *item_next;
OPAL_LIST_FOREACH_SAFE(item, item_next,
selectable, mca_coll_base_avail_coll_t) {
if( component_in_argv(coll_include, item->ac_component_name) ) {
opal_list_remove_item(selectable, &item->super);
opal_list_prepend(selectable, &item->super);
}
}
opal_argv_free(coll_argv);
if( NULL != coll_exclude ) {
free(coll_exclude);
}
if( NULL != coll_include ) {
free(coll_include);
}
/* All done */ /* All done */
return selectable; return selectable;
} }
@ -403,7 +499,6 @@ static int check_one_component(ompi_communicator_t * comm,
return priority; return priority;
} }
/************************************************************************** /**************************************************************************
* Query functions * Query functions
**************************************************************************/ **************************************************************************/

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2016 The University of Tennessee and The University * Copyright (c) 2004-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -29,6 +29,8 @@
#include "ompi/mca/topo/base/base.h" #include "ompi/mca/topo/base/base.h"
#include "ompi/mca/pml/pml.h" #include "ompi/mca/pml/pml.h"
#include "coll_base_util.h" #include "coll_base_util.h"
#include "coll_base_functions.h"
#include <ctype.h>
int ompi_coll_base_sendrecv_actual( const void* sendbuf, size_t scount, int ompi_coll_base_sendrecv_actual( const void* sendbuf, size_t scount,
ompi_datatype_t* sdatatype, ompi_datatype_t* sdatatype,
@ -268,7 +270,7 @@ int ompi_coll_base_retain_datatypes_w( ompi_request_t *req,
} else { } else {
scount = rcount = OMPI_COMM_IS_INTER(comm)?ompi_comm_remote_size(comm):ompi_comm_size(comm); scount = rcount = OMPI_COMM_IS_INTER(comm)?ompi_comm_remote_size(comm):ompi_comm_size(comm);
} }
for (int i=0; i<scount; i++) { for (int i=0; i<scount; i++) {
if (NULL != stypes && NULL != stypes[i] && !ompi_datatype_is_predefined(stypes[i])) { if (NULL != stypes && NULL != stypes[i] && !ompi_datatype_is_predefined(stypes[i])) {
OBJ_RETAIN(stypes[i]); OBJ_RETAIN(stypes[i]);
@ -297,7 +299,8 @@ int ompi_coll_base_retain_datatypes_w( ompi_request_t *req,
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
static void nbc_req_cons(ompi_coll_base_nbc_request_t *req) { static void nbc_req_cons(ompi_coll_base_nbc_request_t *req)
{
req->cb.req_complete_cb = NULL; req->cb.req_complete_cb = NULL;
req->req_complete_cb_data = NULL; req->req_complete_cb_data = NULL;
req->data.objs.objs[0] = NULL; req->data.objs.objs[0] = NULL;
@ -309,35 +312,249 @@ OBJ_CLASS_INSTANCE(ompi_coll_base_nbc_request_t, ompi_request_t, nbc_req_cons, N
/* File reading functions */ /* File reading functions */
/* Advance the stream past the next newline, bumping the line counter.
 * Stops silently at end-of-file. */
static void skiptonewline (FILE *fptr, int *fileline)
{
    char val;
    int rc;

    do {
        rc = fread(&val, 1, 1, fptr);
        if (0 == rc) {  /* fread returns 0 items at EOF */
            return;
        }
        if ('\n' == val) {
            (*fileline)++;
            return;
        }
    } while (1);
}

/**
 * Read the next long integer from the file, skipping '#'-to-end-of-line
 * comments and any token that does not parse as a long.
 *
 * @param fptr      open stream to read from
 * @param fileline  in/out line counter, incremented on each newline consumed
 * @param val       output: the parsed value (untouched on failure)
 * @return 0 on success, -1 on end-of-file
 */
int ompi_coll_base_file_getnext_long(FILE *fptr, int *fileline, long* val)
{
    char trash;
    int rc;

    do {
        rc = fscanf(fptr, "%li", val);
        if (rc == EOF) {
            return -1;
        }
        if (1 == rc) {
            return 0;
        }
        /* in all other cases, skip one character and retry.
         * BUGFIX: fread() returns the number of items read (0 at EOF),
         * never EOF, so compare against 0 instead of EOF. */
        if (0 == fread(&trash, sizeof(char), 1, fptr)) {
            return -1;
        }
        if ('\n' == trash) (*fileline)++;
        if ('#' == trash) {
            skiptonewline (fptr, fileline);
        }
    } while (1);
}
/**
 * Read the next whitespace-delimited token (at most 32 characters) from the
 * file, skipping '#'-to-end-of-line comments.
 *
 * @param fptr      open stream to read from
 * @param fileline  in/out line counter, incremented on each newline consumed
 * @param val       output: malloc()ed copy of the token (caller frees),
 *                  set to NULL on failure
 * @return 0 on success, -1 on end-of-file
 */
int ompi_coll_base_file_getnext_string(FILE *fptr, int *fileline, char** val)
{
    /* BUGFIX: "%32s" stores up to 32 characters PLUS the terminating NUL,
     * so the buffer must hold 33 bytes, not 32. */
    char trash, token[33];
    int rc;

    *val = NULL;  /* security in case we fail */
    do {
        rc = fscanf(fptr, "%32s", token);
        if (rc == EOF) {
            return -1;
        }
        if (1 == rc) {
            if( '#' == token[0] ) {  /* comment token: skip the rest of the line */
                skiptonewline(fptr, fileline);
                continue;
            }
            *val = (char*)malloc(strlen(token) + 1);
            strcpy(*val, token);
            return 0;
        }
        /* in all other cases, skip to the end of the token.
         * BUGFIX: fread() returns 0 at EOF, never EOF. */
        if (0 == fread(&trash, sizeof(char), 1, fptr)) {
            return -1;
        }
        if ('\n' == trash) (*fileline)++;
        if ('#' == trash) {
            skiptonewline (fptr, fileline);
        }
    } while (1);
}
/**
 * Read the next size_t value from the file, skipping '#'-to-end-of-line
 * comments and unparsable tokens.
 *
 * @param fptr      open stream to read from
 * @param fileline  in/out line counter, incremented on each newline consumed
 * @param val       output: the parsed value (untouched on failure)
 * @return 0 on success, -1 on end-of-file
 */
int ompi_coll_base_file_getnext_size_t(FILE *fptr, int *fileline, size_t* val)
{
    char trash;
    int rc;

    do {
        rc = fscanf(fptr, "%" PRIsize_t, val);
        if (rc == EOF) {
            return -1;
        }
        if (1 == rc) {
            return 0;
        }
        /* in all other cases, skip to the end of the token.
         * BUGFIX: fread() returns 0 at EOF, never EOF. */
        if (0 == fread(&trash, sizeof(char), 1, fptr)) {
            return -1;
        }
        if ('\n' == trash) (*fileline)++;
        if ('#' == trash) {
            skiptonewline (fptr, fileline);
        }
    } while (1);
}
/**
 * Peek at the next meaningful character in the stream, transparently
 * skipping newlines (counted in *fileline), blanks, and '#'-to-end-of-line
 * comments.  If that character equals @expected it is consumed and 1 is
 * returned; otherwise it is pushed back (fseek one byte) and 0 is returned.
 *
 * @return 1 on match (char consumed), 0 on mismatch (char pushed back),
 *         -1 on end-of-file or if the push-back fseek fails.
 */
int ompi_coll_base_file_peek_next_char_is(FILE *fptr, int *fileline, int expected)
{
    char trash;
    int rc;

    do {
        rc = fread(&trash, sizeof(char), 1, fptr);
        if (0 == rc) { /* hit the end of the file */
            return -1;
        }
        if ('\n' == trash) {
            (*fileline)++;
            continue;
        }
        if ('#' == trash) {
            skiptonewline (fptr, fileline);
            continue;
        }
        if( trash == expected )
            return 1; /* return true and eat the char */
        if( isblank(trash) ) /* skip all spaces if that's not what we were looking for */
            continue;
        /* not a match: undo the one-byte read so the caller sees the char */
        if( 0 != fseek(fptr, -1, SEEK_CUR) )
            return -1;
        return 0;
    } while (1);
}
/**
 * Map a collective name ("bcast", "allreduce", ...) to its COLLTYPE_T id.
 *
 * There are certainly simpler implementations of this function when
 * performance is not a critical point. But, as this function is used during
 * the collective configuration, and we can do this configuration once for
 * each communicator, I would rather have a more complex but faster
 * implementation.
 * The approach here is to search for the largest common denominators, to
 * create something similar to a dichotomic search: discriminate on leading
 * characters first, then compare only the distinguishing suffix.
 *
 * @param name  NUL-terminated collective name
 * @return the COLLTYPE_T id, or -1 if the name is not recognized
 */
int mca_coll_base_name_to_colltype(const char* name)
{
    /* "neighbor_all*" family */
    if( 'n' == name[0] ) {
        if( 0 == strncmp(name, "neighbor_all", 12) ) {
            if( 't' != name[12] ) {
                /* neighbor_allgather / neighbor_allgatherv */
                if( 0 == strncmp(name+12, "gather", 6) ) {
                    if('\0' == name[18]) return NEIGHBOR_ALLGATHER;
                    if( 'v' == name[18]) return NEIGHBOR_ALLGATHERV;
                }
            } else {
                /* neighbor_alltoall{,v,w} */
                if( 0 == strncmp(name+12, "toall", 5) ) {
                    if( '\0' == name[17] ) return NEIGHBOR_ALLTOALL;
                    if( 'v' == name[17] ) return NEIGHBOR_ALLTOALLV;
                    if( 'w' == name[17] ) return NEIGHBOR_ALLTOALLW;
                }
            }
        }
        return -1;
    }
    /* "all*" family */
    if( 'a' == name[0] ) {
        if( 0 != strncmp(name, "all", 3) ) {
            return -1;
        }
        if( 't' != name[3] ) {
            if( 'r' == name[3] ) {
                if( 0 == strcmp(name+3, "reduce") )
                    return ALLREDUCE;
            } else {
                /* allgather / allgatherv */
                if( 0 == strncmp(name+3, "gather", 6) ) {
                    if( '\0' == name[9] ) return ALLGATHER;
                    if( 'v' == name[9] ) return ALLGATHERV;
                }
            }
        } else {
            /* alltoall{,v,w} */
            if( 0 == strncmp(name+3, "toall", 5) ) {
                if( '\0' == name[8] ) return ALLTOALL;
                if( 'v' == name[8] ) return ALLTOALLV;
                if( 'w' == name[8] ) return ALLTOALLW;
            }
        }
        return -1;
    }
    /* names starting before 'r': barrier, bcast, gather(v), exscan */
    if( 'r' > name[0] ) {
        if( 'b' == name[0] ) {
            if( 0 == strcmp(name, "barrier") )
                return BARRIER;
            if( 0 == strcmp(name, "bcast") )
                return BCAST;
        } else if( 'g'== name[0] ) {
            if( 0 == strncmp(name, "gather", 6) ) {
                if( '\0' == name[6] ) return GATHER;
                if( 'v' == name[6] ) return GATHERV;
            }
        }
        if( 0 == strcmp(name, "exscan") )
            return EXSCAN;
        return -1;
    }
    /* names starting before 's' (i.e. 'r'): reduce, reduce_scatter(_block) */
    if( 's' > name[0] ) {
        if( 0 == strncmp(name, "reduce", 6) ) {
            if( '\0' == name[6] ) return REDUCE;
            if( '_' == name[6] ) {
                if( 0 == strncmp(name+7, "scatter", 7) ) {
                    if( '\0' == name[14] ) return REDUCESCATTER;
                    if( 0 == strcmp(name+14, "_block") ) return REDUCESCATTERBLOCK;
                }
            }
        }
        return -1;
    }
    /* remaining: scan, scatterv, scatter (scatterv checked before scatter
     * is irrelevant here since strcmp matches exactly) */
    if( 0 == strcmp(name, "scan") )
        return SCAN;
    if( 0 == strcmp(name, "scatterv") )
        return SCATTERV;
    if( 0 == strcmp(name, "scatter") )
        return SCATTER;
    return -1;
}
/* conversion table for all COLLTYPE_T values defined in ompi/mca/coll/base/coll_base_functions.h */
/* conversion table for all COLLTYPE_T values defined in
 * ompi/mca/coll/base/coll_base_functions.h */
static const char* colltype_translation_table[] = {
    [ALLGATHER] = "allgather",
    [ALLGATHERV] = "allgatherv",
    [ALLREDUCE] = "allreduce",
    [ALLTOALL] = "alltoall",
    [ALLTOALLV] = "alltoallv",
    [ALLTOALLW] = "alltoallw",
    [BARRIER] = "barrier",
    [BCAST] = "bcast",
    [EXSCAN] = "exscan",
    [GATHER] = "gather",
    [GATHERV] = "gatherv",
    [REDUCE] = "reduce",
    [REDUCESCATTER] = "reduce_scatter",
    [REDUCESCATTERBLOCK] = "reduce_scatter_block",
    [SCAN] = "scan",
    [SCATTER] = "scatter",
    [SCATTERV] = "scatterv",
    [NEIGHBOR_ALLGATHER] = "neighbor_allgather",
    [NEIGHBOR_ALLGATHERV] = "neighbor_allgatherv",
    [NEIGHBOR_ALLTOALL] = "neighbor_alltoall",
    [NEIGHBOR_ALLTOALLV] = "neighbor_alltoallv",
    [NEIGHBOR_ALLTOALLW] = "neighbor_alltoallw",
    [COLLCOUNT] = NULL
};

/**
 * Return a heap-allocated name (caller frees) for the given collective id,
 * or NULL when the id is out of range or has no entry in the table.
 */
char* mca_coll_base_colltype_to_str(int collid)
{
    if( (collid < 0) || (collid >= COLLCOUNT) ) {
        return NULL;
    }
    /* ROBUSTNESS: designated initializers leave unlisted enum values as
     * NULL, and strdup(NULL) is undefined behavior — guard explicitly. */
    if( NULL == colltype_translation_table[collid] ) {
        return NULL;
    }
    return strdup(colltype_translation_table[collid]);
}

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2015 The University of Tennessee and The University * Copyright (c) 2004-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2004-2007 High Performance Computing Center Stuttgart, * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
@ -178,8 +178,17 @@ int ompi_coll_base_retain_datatypes_w( ompi_request_t *request,
ompi_datatype_t *rtypes[]); ompi_datatype_t *rtypes[]);
/* File reading function */ /* File reading function */
#define MYEOF -999 int ompi_coll_base_file_getnext_long(FILE *fptr, int *fileline, long* val);
long ompi_coll_base_file_getnext(FILE *fptr, int *fileline); int ompi_coll_base_file_getnext_size_t(FILE *fptr, int *fileline, size_t* val);
int ompi_coll_base_file_getnext_string(FILE *fptr, int *fileline, char** val);
/* peek at the next valid token to see if it begins with the expected value. If yes
* eat the value, otherwise put it back into the file.
*/
int ompi_coll_base_file_peek_next_char_is(FILE *fptr, int *fileline, int expected);
/* Miscelaneous function */
char* mca_coll_base_colltype_to_str(int collid);
int mca_coll_base_name_to_colltype(const char* name);
END_C_DECLS END_C_DECLS
#endif /* MCA_COLL_BASE_UTIL_EXPORT_H */ #endif /* MCA_COLL_BASE_UTIL_EXPORT_H */

Просмотреть файл

@ -26,8 +26,7 @@ coll_han_trigger.c \
coll_han_dynamic.c \ coll_han_dynamic.c \
coll_han_dynamic_file.c \ coll_han_dynamic_file.c \
coll_han_topo.c \ coll_han_topo.c \
coll_han_subcomms.c \ coll_han_subcomms.c
coll_han_utils.c
# Make the output library in this directory, and name it either # Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la # mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la

Просмотреть файл

@ -20,9 +20,7 @@
#include "opal/util/output.h" #include "opal/util/output.h"
#include "ompi/mca/coll/base/coll_base_functions.h" #include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_han_trigger.h" #include "coll_han_trigger.h"
#include "ompi/mca/coll/han/coll_han_dynamic.h" #include "ompi/mca/coll/han/coll_han_dynamic.h"
BEGIN_C_DECLS
/* /*
* Today; * Today;
@ -33,131 +31,125 @@ BEGIN_C_DECLS
#define COLL_HAN_LOW_MODULES 2 #define COLL_HAN_LOW_MODULES 2
#define COLL_HAN_UP_MODULES 2 #define COLL_HAN_UP_MODULES 2
typedef struct { struct mca_coll_han_bcast_args_s {
uint32_t umod;
uint32_t lmod;
uint32_t fs;
uint32_t ualg;
uint32_t us;
} selection;
struct mca_bcast_argu_s {
mca_coll_task_t *cur_task; mca_coll_task_t *cur_task;
ompi_communicator_t *up_comm;
ompi_communicator_t *low_comm;
void *buff; void *buff;
ompi_datatype_t *dtype;
int seg_count; int seg_count;
struct ompi_datatype_t *dtype;
int root_low_rank; int root_low_rank;
int root_up_rank; int root_up_rank;
struct ompi_communicator_t *up_comm;
struct ompi_communicator_t *low_comm;
int num_segments; int num_segments;
int cur_seg; int cur_seg;
int w_rank; int w_rank;
int last_seg_count; int last_seg_count;
bool noop; bool noop;
}; };
typedef struct mca_bcast_argu_s mca_bcast_argu_t; typedef struct mca_coll_han_bcast_args_s mca_coll_han_bcast_args_t;
struct mca_reduce_argu_s { struct mca_coll_han_reduce_args_s {
mca_coll_task_t *cur_task; mca_coll_task_t *cur_task;
ompi_communicator_t *up_comm;
ompi_communicator_t *low_comm;
void *sbuf; void *sbuf;
void *rbuf; void *rbuf;
ompi_op_t *op;
ompi_datatype_t *dtype;
int seg_count; int seg_count;
struct ompi_datatype_t *dtype;
struct ompi_op_t *op;
int root_low_rank; int root_low_rank;
int root_up_rank; int root_up_rank;
struct ompi_communicator_t *up_comm;
struct ompi_communicator_t *low_comm;
int num_segments; int num_segments;
int cur_seg; int cur_seg;
int w_rank; int w_rank;
int last_seg_count; int last_seg_count;
bool noop; bool noop;
bool is_tmp_rbuf;
}; };
typedef struct mca_reduce_argu_s mca_reduce_argu_t; typedef struct mca_coll_han_reduce_args_s mca_coll_han_reduce_args_t;
struct mca_allreduce_argu_s { struct mca_coll_han_allreduce_args_s {
mca_coll_task_t *cur_task; mca_coll_task_t *cur_task;
void *sbuf; ompi_communicator_t *up_comm;
void *rbuf; ompi_communicator_t *low_comm;
int seg_count;
struct ompi_datatype_t *dtype;
struct ompi_op_t *op;
int root_up_rank;
int root_low_rank;
struct ompi_communicator_t *up_comm;
struct ompi_communicator_t *low_comm;
int num_segments;
int cur_seg;
int w_rank;
int last_seg_count;
bool noop;
ompi_request_t *req; ompi_request_t *req;
void *sbuf;
void *rbuf;
ompi_op_t *op;
ompi_datatype_t *dtype;
int seg_count;
int root_up_rank;
int root_low_rank;
int num_segments;
int cur_seg;
int w_rank;
int last_seg_count;
bool noop;
int *completed; int *completed;
}; };
typedef struct mca_allreduce_argu_s mca_allreduce_argu_t; typedef struct mca_coll_han_allreduce_args_s mca_coll_han_allreduce_args_t;
struct mca_scatter_argu_s { struct mca_coll_han_scatter_args_s {
mca_coll_task_t *cur_task; mca_coll_task_t *cur_task;
ompi_communicator_t *up_comm;
ompi_communicator_t *low_comm;
ompi_request_t *req;
void *sbuf; void *sbuf;
void *sbuf_inter_free; void *sbuf_inter_free;
void *sbuf_reorder_free; void *sbuf_reorder_free;
int scount;
struct ompi_datatype_t *sdtype;
void *rbuf; void *rbuf;
ompi_datatype_t *sdtype;
ompi_datatype_t *rdtype;
int scount;
int rcount; int rcount;
struct ompi_datatype_t *rdtype;
int root; int root;
int root_up_rank; int root_up_rank;
int root_low_rank; int root_low_rank;
struct ompi_communicator_t *up_comm;
struct ompi_communicator_t *low_comm;
int w_rank; int w_rank;
bool noop; bool noop;
ompi_request_t *req;
}; };
typedef struct mca_scatter_argu_s mca_scatter_argu_t; typedef struct mca_coll_han_scatter_args_s mca_coll_han_scatter_args_t;
struct mca_gather_argu_s { struct mca_coll_han_gather_args_s {
mca_coll_task_t *cur_task; mca_coll_task_t *cur_task;
ompi_communicator_t *up_comm;
ompi_communicator_t *low_comm;
ompi_request_t *req;
void *sbuf; void *sbuf;
void *sbuf_inter_free; void *sbuf_inter_free;
int scount;
struct ompi_datatype_t *sdtype;
void *rbuf; void *rbuf;
ompi_datatype_t *sdtype;
ompi_datatype_t *rdtype;
int scount;
int rcount; int rcount;
struct ompi_datatype_t *rdtype;
int root; int root;
int root_up_rank; int root_up_rank;
int root_low_rank; int root_low_rank;
struct ompi_communicator_t *up_comm;
struct ompi_communicator_t *low_comm;
int w_rank; int w_rank;
bool noop; bool noop;
ompi_request_t *req; bool is_mapbycore;
}; };
typedef struct mca_gather_argu_s mca_gather_argu_t; typedef struct mca_coll_han_gather_args_s mca_coll_han_gather_args_t;
struct mca_allgather_argu_s { struct mca_coll_han_allgather_s {
mca_coll_task_t *cur_task; mca_coll_task_t *cur_task;
ompi_communicator_t *up_comm;
ompi_communicator_t *low_comm;
ompi_request_t *req;
void *sbuf; void *sbuf;
void *sbuf_inter_free; void *sbuf_inter_free;
int scount;
struct ompi_datatype_t *sdtype;
void *rbuf; void *rbuf;
ompi_datatype_t *sdtype;
ompi_datatype_t *rdtype;
int scount;
int rcount; int rcount;
struct ompi_datatype_t *rdtype;
int root_low_rank; int root_low_rank;
struct ompi_communicator_t *up_comm;
struct ompi_communicator_t *low_comm;
int w_rank; int w_rank;
bool noop; bool noop;
bool is_mapbycore; bool is_mapbycore;
int *topo; int *topo;
ompi_request_t *req;
}; };
typedef struct mca_allgather_argu_s mca_allgather_argu_t; typedef struct mca_coll_han_allgather_s mca_coll_han_allgather_t;
/** /**
* Structure to hold the han coll component. First it holds the * Structure to hold the han coll component. First it holds the
@ -184,7 +176,7 @@ typedef struct mca_coll_han_component_t {
/* up level module for reduce */ /* up level module for reduce */
uint32_t han_reduce_up_module; uint32_t han_reduce_up_module;
/* low level module for reduce */ /* low level module for reduce */
uint32_t han_reduce_low_module; uint32_t han_reduce_low_module;
/* segment size for allreduce */ /* segment size for allreduce */
uint32_t han_allreduce_segsize; uint32_t han_allreduce_segsize;
/* up level module for allreduce */ /* up level module for allreduce */
@ -203,21 +195,10 @@ typedef struct mca_coll_han_component_t {
uint32_t han_scatter_up_module; uint32_t han_scatter_up_module;
/* low level module for scatter */ /* low level module for scatter */
uint32_t han_scatter_low_module; uint32_t han_scatter_low_module;
/* whether enable auto tune */
uint32_t han_auto_tune;
/* whether we need reproducible results /* whether we need reproducible results
* (but disables topological optimisations) * (but disables topological optimisations)
*/ */
uint32_t han_reproducible; uint32_t han_reproducible;
/* create a 3D array
* num_processes (n): 2 4 8 16 32 64 (6)
* num_core (c): 2 4 8 12 (4)
* message size (m): 1 - 4194304 (23)
*/
uint32_t han_auto_tune_n;
uint32_t han_auto_tune_c;
uint32_t han_auto_tune_m;
selection *han_auto_tuned;
bool use_simple_algorithm[COLLCOUNT]; bool use_simple_algorithm[COLLCOUNT];
/* Dynamic configuration rules */ /* Dynamic configuration rules */
@ -228,7 +209,6 @@ typedef struct mca_coll_han_component_t {
mca_coll_han_dynamic_rules_t dynamic_rules; mca_coll_han_dynamic_rules_t dynamic_rules;
/* Dynamic rules from mca parameter */ /* Dynamic rules from mca parameter */
COMPONENT_T mca_rules[COLLCOUNT][NB_TOPO_LVL]; COMPONENT_T mca_rules[COLLCOUNT][NB_TOPO_LVL];
int topo_level;
/* Define maximum dynamic errors printed by rank 0 with a 0 verbosity level */ /* Define maximum dynamic errors printed by rank 0 with a 0 verbosity level */
int max_dynamic_errors; int max_dynamic_errors;
@ -240,7 +220,7 @@ typedef void (*previous_dummy_fn_t) (void);
* Structure used to store what is necessary for the collective operations * Structure used to store what is necessary for the collective operations
* routines in case of fallback. * routines in case of fallback.
*/ */
typedef struct collective_fallback_t { typedef struct mca_coll_han_single_collective_fallback_s {
union { union {
mca_coll_base_module_allgather_fn_t allgather; mca_coll_base_module_allgather_fn_t allgather;
mca_coll_base_module_allgatherv_fn_t allgatherv; mca_coll_base_module_allgatherv_fn_t allgatherv;
@ -250,9 +230,24 @@ typedef struct collective_fallback_t {
mca_coll_base_module_reduce_fn_t reduce; mca_coll_base_module_reduce_fn_t reduce;
mca_coll_base_module_scatter_fn_t scatter; mca_coll_base_module_scatter_fn_t scatter;
previous_dummy_fn_t dummy; previous_dummy_fn_t dummy;
} previous_routine; };
mca_coll_base_module_t *previous_module; mca_coll_base_module_t* module;
} collective_fallback_t; } mca_coll_han_single_collective_fallback_t;
/*
* The structure containing a replacement for all collective supported
* by HAN. This structure is used as a fallback during subcommunicator
* creation.
*/
typedef struct mca_coll_han_collectives_fallback_s {
mca_coll_han_single_collective_fallback_t allgather;
mca_coll_han_single_collective_fallback_t allgatherv;
mca_coll_han_single_collective_fallback_t allreduce;
mca_coll_han_single_collective_fallback_t bcast;
mca_coll_han_single_collective_fallback_t reduce;
mca_coll_han_single_collective_fallback_t gather;
mca_coll_han_single_collective_fallback_t scatter;
} mca_coll_han_collectives_fallback_t;
/** Coll han module */ /** Coll han module */
typedef struct mca_coll_han_module_t { typedef struct mca_coll_han_module_t {
@ -262,7 +257,6 @@ typedef struct mca_coll_han_module_t {
/* Whether this module has been lazily initialized or not yet */ /* Whether this module has been lazily initialized or not yet */
bool enabled; bool enabled;
struct ompi_communicator_t *cached_comm;
struct ompi_communicator_t **cached_low_comms; struct ompi_communicator_t **cached_low_comms;
struct ompi_communicator_t **cached_up_comms; struct ompi_communicator_t **cached_up_comms;
int *cached_vranks; int *cached_vranks;
@ -271,7 +265,7 @@ typedef struct mca_coll_han_module_t {
bool are_ppn_imbalanced; bool are_ppn_imbalanced;
/* To be able to fallback when the cases are not supported */ /* To be able to fallback when the cases are not supported */
struct collective_fallback_t previous_routines[COLLCOUNT]; struct mca_coll_han_collectives_fallback_s fallback;
/* To be able to fallback on reproducible algorithm */ /* To be able to fallback on reproducible algorithm */
mca_coll_base_module_reduce_fn_t reproducible_reduce; mca_coll_base_module_reduce_fn_t reproducible_reduce;
@ -280,7 +274,7 @@ typedef struct mca_coll_han_module_t {
mca_coll_base_module_t *reproducible_allreduce_module; mca_coll_base_module_t *reproducible_allreduce_module;
/* Topological level of this communicator */ /* Topological level of this communicator */
int topologic_level; TOPO_LVL_T topologic_level;
/* Collective module storage for module choice */ /* Collective module storage for module choice */
mca_coll_han_collective_modules_storage_t modules_storage; mca_coll_han_collective_modules_storage_t modules_storage;
@ -302,21 +296,53 @@ OBJ_CLASS_DECLARATION(mca_coll_han_module_t);
* Some defines to stick to the naming used in the other components in terms of * Some defines to stick to the naming used in the other components in terms of
* fallback routines * fallback routines
*/ */
#define previous_allgather previous_routines[ALLGATHER].previous_routine.allgather #define previous_allgather fallback.allgather.allgather
#define previous_allgatherv previous_routines[ALLGATHERV].previous_routine.allgatherv #define previous_allgather_module fallback.allgather.module
#define previous_allreduce previous_routines[ALLREDUCE].previous_routine.allreduce
#define previous_bcast previous_routines[BCAST].previous_routine.bcast #define previous_allgatherv fallback.allgatherv.allgatherv
#define previous_gather previous_routines[GATHER].previous_routine.gather #define previous_allgatherv_module fallback.allgatherv.module
#define previous_reduce previous_routines[REDUCE].previous_routine.reduce
#define previous_scatter previous_routines[SCATTER].previous_routine.scatter #define previous_allreduce fallback.allreduce.allreduce
#define previous_allreduce_module fallback.allreduce.module
#define previous_bcast fallback.bcast.bcast
#define previous_bcast_module fallback.bcast.module
#define previous_reduce fallback.reduce.reduce
#define previous_reduce_module fallback.reduce.module
#define previous_gather fallback.gather.gather
#define previous_gather_module fallback.gather.module
#define previous_scatter fallback.scatter.scatter
#define previous_scatter_module fallback.scatter.module
/* macro to correctly load a fallback collective module */
#define HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, COLL) \
do { \
if ( ((COMM)->c_coll->coll_ ## COLL ## _module) == (mca_coll_base_module_t*)(HANM) ) { \
(COMM)->c_coll->coll_ ## COLL = (HANM)->fallback.COLL.COLL; \
mca_coll_base_module_t *coll_module = (COMM)->c_coll->coll_ ## COLL ## _module; \
(COMM)->c_coll->coll_ ## COLL ## _module = (HANM)->fallback.COLL.module; \
OBJ_RETAIN((COMM)->c_coll->coll_ ## COLL ## _module); \
OBJ_RELEASE(coll_module); \
} \
} while(0)
/* macro to correctly load /all/ fallback collectives */
#define HAN_LOAD_FALLBACK_COLLECTIVES(HANM, COMM) \
do { \
HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, bcast); \
HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, scatter); \
HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, gather); \
HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, reduce); \
HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, allreduce); \
HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, allgather); \
HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, allgatherv); \
han_module->enabled = false; /* entire module set to pass-through from now on */ \
} while(0)
#define previous_allgather_module previous_routines[ALLGATHER].previous_module
#define previous_allgatherv_module previous_routines[ALLGATHERV].previous_module
#define previous_allreduce_module previous_routines[ALLREDUCE].previous_module
#define previous_bcast_module previous_routines[BCAST].previous_module
#define previous_gather_module previous_routines[GATHER].previous_module
#define previous_reduce_module previous_routines[REDUCE].previous_module
#define previous_scatter_module previous_routines[SCATTER].previous_module
/** /**
* Global component instance * Global component instance
@ -333,20 +359,30 @@ mca_coll_base_module_t *mca_coll_han_comm_query(struct ompi_communicator_t *comm
int han_request_free(ompi_request_t ** request); int han_request_free(ompi_request_t ** request);
/* Subcommunicator creation */ /* Subcommunicator creation */
void mca_coll_han_comm_create(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module); int mca_coll_han_comm_create(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module);
void mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, mca_coll_han_module_t *han_module); int mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, mca_coll_han_module_t *han_module);
/* Gather topology information */
/**
* Gather topology information
*
* Returns a pointer to the (potentially already cached) topology.
* NOTE: if the rank distribution is imbalanced, no effort will be made to gather
* the topology at all ranks and instead NULL is returned and han_module->is_mapbycore
* is set to false.
* If HAN ever learns to deal with imbalanced topologies, this needs fixing!
*/
int *mca_coll_han_topo_init(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module, int *mca_coll_han_topo_init(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module,
int num_topo_level); int num_topo_level);
/* Utils */ /* Utils */
void mca_coll_han_get_ranks(int *vranks, int root, int low_size, int *root_low_rank, static inline void
int *root_up_rank); mca_coll_han_get_ranks(int *vranks, int root, int low_size,
uint32_t han_auto_tuned_get_n(uint32_t n); int *root_low_rank, int *root_up_rank)
uint32_t han_auto_tuned_get_c(uint32_t c); {
uint32_t han_auto_tuned_get_m(uint32_t m); *root_up_rank = vranks[root] / low_size;
*root_low_rank = vranks[root] % low_size;
}
const char* mca_coll_han_colltype_to_str(COLLTYPE_T coll);
const char* mca_coll_han_topo_lvl_to_str(TOPO_LVL_T topo_lvl); const char* mca_coll_han_topo_lvl_to_str(TOPO_LVL_T topo_lvl);
/** Dynamic component choice */ /** Dynamic component choice */
@ -356,7 +392,7 @@ const char* mca_coll_han_topo_lvl_to_str(TOPO_LVL_T topo_lvl);
*/ */
int int
mca_coll_han_get_all_coll_modules(struct ompi_communicator_t *comm, mca_coll_han_get_all_coll_modules(struct ompi_communicator_t *comm,
mca_coll_han_module_t *han_module); mca_coll_han_module_t *han_module);
int int
mca_coll_han_allgather_intra_dynamic(ALLGATHER_BASE_ARGS, mca_coll_han_allgather_intra_dynamic(ALLGATHER_BASE_ARGS,
@ -382,22 +418,13 @@ mca_coll_han_scatter_intra_dynamic(SCATTER_BASE_ARGS,
/* Bcast */ /* Bcast */
int mca_coll_han_bcast_intra_simple(void *buff, int mca_coll_han_bcast_intra_simple(void *buff,
int count, int count,
struct ompi_datatype_t *dtype, struct ompi_datatype_t *dtype,
int root, int root,
struct ompi_communicator_t *comm, struct ompi_communicator_t *comm,
mca_coll_base_module_t *module); mca_coll_base_module_t *module);
void mac_coll_han_set_bcast_argu(mca_bcast_argu_t * argu, mca_coll_task_t * cur_task, void *buff,
int seg_count, struct ompi_datatype_t *dtype,
int root_up_rank, int root_low_rank,
struct ompi_communicator_t *up_comm,
struct ompi_communicator_t *low_comm,
int num_segments, int cur_seg, int w_rank, int last_seg_count,
bool noop);
int mca_coll_han_bcast_intra(void *buff, int count, struct ompi_datatype_t *dtype, int root, int mca_coll_han_bcast_intra(void *buff, int count, struct ompi_datatype_t *dtype, int root,
struct ompi_communicator_t *comm, mca_coll_base_module_t * module); struct ompi_communicator_t *comm, mca_coll_base_module_t * module);
int mca_coll_han_bcast_t0_task(void *task_argu);
int mca_coll_han_bcast_t1_task(void *task_argu);
/* Reduce */ /* Reduce */
int int
@ -422,145 +449,75 @@ mca_coll_han_reduce_reproducible(const void *sbuf,
struct ompi_communicator_t *comm, struct ompi_communicator_t *comm,
mca_coll_base_module_t *module); mca_coll_base_module_t *module);
int mca_coll_han_reduce_intra(const void *sbuf,
void mac_coll_han_set_reduce_argu(mca_reduce_argu_t * argu, mca_coll_task_t * cur_task,
void *sbuf,
void *rbuf, int seg_count, struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
int root_up_rank, int root_low_rank,
struct ompi_communicator_t *up_comm,
struct ompi_communicator_t *low_comm,
int num_segments, int cur_seg, int w_rank, int last_seg_count,
bool noop);
int mca_coll_han_reduce_intra(const void *sbuf,
void *rbuf, void *rbuf,
int count, int count,
struct ompi_datatype_t *dtype, struct ompi_datatype_t *dtype,
ompi_op_t* op, ompi_op_t* op,
int root, int root,
struct ompi_communicator_t *comm, struct ompi_communicator_t *comm,
mca_coll_base_module_t * module); mca_coll_base_module_t * module);
int mca_coll_han_reduce_t0_task(void *task_argu);
int mca_coll_han_reduce_t1_task(void *task_argu);
/* Allreduce */ /* Allreduce */
int int
mca_coll_han_allreduce_intra_simple(const void *sbuf, mca_coll_han_allreduce_intra_simple(const void *sbuf,
void *rbuf, void *rbuf,
int count, int count,
struct ompi_datatype_t *dtype, struct ompi_datatype_t *dtype,
struct ompi_op_t *op, struct ompi_op_t *op,
struct ompi_communicator_t *comm, struct ompi_communicator_t *comm,
mca_coll_base_module_t *module); mca_coll_base_module_t *module);
int int
mca_coll_han_allreduce_reproducible_decision(struct ompi_communicator_t *comm, mca_coll_han_allreduce_reproducible_decision(struct ompi_communicator_t *comm,
mca_coll_base_module_t *module); mca_coll_base_module_t *module);
int int
mca_coll_han_allreduce_reproducible(const void *sbuf, mca_coll_han_allreduce_reproducible(const void *sbuf,
void *rbuf, void *rbuf,
int count, int count,
struct ompi_datatype_t *dtype, struct ompi_datatype_t *dtype,
struct ompi_op_t *op, struct ompi_op_t *op,
struct ompi_communicator_t *comm, struct ompi_communicator_t *comm,
mca_coll_base_module_t *module); mca_coll_base_module_t *module);
void mac_coll_han_set_allreduce_argu(mca_allreduce_argu_t * argu,
mca_coll_task_t * cur_task,
void *sbuf,
void *rbuf,
int seg_count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
int root_up_rank,
int root_low_rank,
struct ompi_communicator_t *up_comm,
struct ompi_communicator_t *low_comm,
int num_segments,
int cur_seg,
int w_rank,
int last_seg_count,
bool noop, ompi_request_t * req, int *completed);
int mca_coll_han_allreduce_intra(const void *sbuf, int mca_coll_han_allreduce_intra(const void *sbuf,
void *rbuf, void *rbuf,
int count, int count,
struct ompi_datatype_t *dtype, struct ompi_datatype_t *dtype,
struct ompi_op_t *op, struct ompi_op_t *op,
struct ompi_communicator_t *comm, mca_coll_base_module_t * module); struct ompi_communicator_t *comm, mca_coll_base_module_t * module);
int mca_coll_han_allreduce_t0_task(void *task_argu);
int mca_coll_han_allreduce_t1_task(void *task_argu);
int mca_coll_han_allreduce_t2_task(void *task_argu);
int mca_coll_han_allreduce_t3_task(void *task_argu);
/* Scatter */ /* Scatter */
int int
mca_coll_han_scatter_intra(const void *sbuf, int scount, mca_coll_han_scatter_intra(const void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root,
struct ompi_communicator_t *comm, mca_coll_base_module_t * module);
int mca_coll_han_scatter_us_task(void *task_argu);
int mca_coll_han_scatter_ls_task(void *task_argu);
void mac_coll_han_set_scatter_argu(mca_scatter_argu_t * argu,
mca_coll_task_t * cur_task,
void *sbuf,
void *sbuf_inter_free,
void *sbuf_reorder_free,
int scount,
struct ompi_datatype_t *sdtype,
void *rbuf,
int rcount,
struct ompi_datatype_t *rdtype,
int root,
int root_up_rank,
int root_low_rank,
struct ompi_communicator_t *up_comm,
struct ompi_communicator_t *low_comm,
int w_rank, bool noop, ompi_request_t * req);
/* Gather */
int
mca_coll_han_gather_intra(const void *sbuf, int scount,
struct ompi_datatype_t *sdtype, struct ompi_datatype_t *sdtype,
void *rbuf, int rcount, void *rbuf, int rcount,
struct ompi_datatype_t *rdtype, struct ompi_datatype_t *rdtype,
int root, int root,
struct ompi_communicator_t *comm, mca_coll_base_module_t * module); struct ompi_communicator_t *comm, mca_coll_base_module_t * module);
int mca_coll_han_gather_lg_task(void *task_argu);
int mca_coll_han_gather_ug_task(void *task_argu); /* Gather */
void mac_coll_han_set_gather_argu(mca_gather_argu_t * argu, int
mca_coll_task_t * cur_task, mca_coll_han_gather_intra(const void *sbuf, int scount,
void *sbuf, struct ompi_datatype_t *sdtype,
void *sbuf_inter_free, void *rbuf, int rcount,
int scount, struct ompi_datatype_t *rdtype,
struct ompi_datatype_t *sdtype, int root,
void *rbuf, struct ompi_communicator_t *comm, mca_coll_base_module_t * module);
int rcount,
struct ompi_datatype_t *rdtype,
int root,
int root_up_rank,
int root_low_rank,
struct ompi_communicator_t *up_comm,
struct ompi_communicator_t *low_comm,
int w_rank, bool noop, ompi_request_t * req);
int int
mca_coll_han_gather_intra_simple(const void *sbuf, int scount, mca_coll_han_gather_intra_simple(const void *sbuf, int scount,
struct ompi_datatype_t *sdtype, struct ompi_datatype_t *sdtype,
void *rbuf, int rcount, void *rbuf, int rcount,
struct ompi_datatype_t *rdtype, struct ompi_datatype_t *rdtype,
int root, int root,
struct ompi_communicator_t *comm, struct ompi_communicator_t *comm,
mca_coll_base_module_t *module); mca_coll_base_module_t *module);
/* reordering after gather, for unordered ranks */ /* reordering after gather, for unordered ranks */
void void
ompi_coll_han_reorder_gather(const void *sbuf, ompi_coll_han_reorder_gather(const void *sbuf,
void *rbuf, int rcount, void *rbuf, int rcount,
struct ompi_datatype_t *rdtype, struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm, struct ompi_communicator_t *comm,
int * topo); int * topo);
@ -571,30 +528,12 @@ mca_coll_han_allgather_intra(const void *sbuf, int scount,
void *rbuf, int rcount, void *rbuf, int rcount,
struct ompi_datatype_t *rdtype, struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm, mca_coll_base_module_t * module); struct ompi_communicator_t *comm, mca_coll_base_module_t * module);
int mca_coll_han_allgather_lg_task(void *task_argu);
int mca_coll_han_allgather_uag_task(void *task_argu);
int mca_coll_han_allgather_lb_task(void *task_argu);
void mac_coll_han_set_allgather_argu(mca_allgather_argu_t * argu,
mca_coll_task_t * cur_task,
void *sbuf,
void *sbuf_inter_free,
int scount,
struct ompi_datatype_t *sdtype,
void *rbuf,
int rcount,
struct ompi_datatype_t *rdtype,
int root_low_rank,
struct ompi_communicator_t *up_comm,
struct ompi_communicator_t *low_comm,
int w_rank,
bool noop, bool is_mapbycore, int *topo, ompi_request_t * req);
int int
mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, mca_coll_han_allgather_intra_simple(const void *sbuf, int scount,
struct ompi_datatype_t *sdtype, struct ompi_datatype_t *sdtype,
void* rbuf, int rcount, void* rbuf, int rcount,
struct ompi_datatype_t *rdtype, struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm, struct ompi_communicator_t *comm,
mca_coll_base_module_t *module); mca_coll_base_module_t *module);
END_C_DECLS
#endif /* MCA_COLL_HAN_EXPORT_H */ #endif /* MCA_COLL_HAN_EXPORT_H */

Просмотреть файл

@ -16,40 +16,45 @@
#include "ompi/mca/pml/pml.h" #include "ompi/mca/pml/pml.h"
#include "coll_han_trigger.h" #include "coll_han_trigger.h"
void mac_coll_han_set_allgather_argu(mca_allgather_argu_t * argu, static int mca_coll_han_allgather_lb_task(void *task_args);
mca_coll_task_t * cur_task, static int mca_coll_han_allgather_lg_task(void *task_args);
void *sbuf, static int mca_coll_han_allgather_uag_task(void *task_args);
void *sbuf_inter_free,
int scount, static inline void
struct ompi_datatype_t *sdtype, mca_coll_han_set_allgather_args(mca_coll_han_allgather_t * args,
void *rbuf, mca_coll_task_t * cur_task,
int rcount, void *sbuf,
struct ompi_datatype_t *rdtype, void *sbuf_inter_free,
int root_low_rank, int scount,
struct ompi_communicator_t *up_comm, struct ompi_datatype_t *sdtype,
struct ompi_communicator_t *low_comm, void *rbuf,
int w_rank, int rcount,
bool noop, struct ompi_datatype_t *rdtype,
bool is_mapbycore, int root_low_rank,
int *topo, struct ompi_communicator_t *up_comm,
ompi_request_t * req) struct ompi_communicator_t *low_comm,
int w_rank,
bool noop,
bool is_mapbycore,
int *topo,
ompi_request_t * req)
{ {
argu->cur_task = cur_task; args->cur_task = cur_task;
argu->sbuf = sbuf; args->sbuf = sbuf;
argu->sbuf_inter_free = sbuf_inter_free; args->sbuf_inter_free = sbuf_inter_free;
argu->scount = scount; args->scount = scount;
argu->sdtype = sdtype; args->sdtype = sdtype;
argu->rbuf = rbuf; args->rbuf = rbuf;
argu->rcount = rcount; args->rcount = rcount;
argu->rdtype = rdtype; args->rdtype = rdtype;
argu->root_low_rank = root_low_rank; args->root_low_rank = root_low_rank;
argu->up_comm = up_comm; args->up_comm = up_comm;
argu->low_comm = low_comm; args->low_comm = low_comm;
argu->w_rank = w_rank; args->w_rank = w_rank;
argu->noop = noop; args->noop = noop;
argu->is_mapbycore = is_mapbycore; args->is_mapbycore = is_mapbycore;
argu->topo = topo; args->topo = topo;
argu->req = req; args->req = req;
} }
int int
@ -60,44 +65,52 @@ mca_coll_han_allgather_intra(const void *sbuf, int scount,
struct ompi_communicator_t *comm, struct ompi_communicator_t *comm,
mca_coll_base_module_t * module) mca_coll_base_module_t * module)
{ {
int w_rank;
w_rank = ompi_comm_rank(comm);
/* Create the subcommunicators */ /* Create the subcommunicators */
mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module;
mca_coll_han_comm_create_new(comm, han_module); if( OMPI_SUCCESS != mca_coll_han_comm_create_new(comm, han_module) ) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle allgather within this communicator. Fall back on another component\n"));
/* HAN cannot work with this communicator so fallback on all collectives */
HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype,
comm, comm->c_coll->coll_allgather_module);
}
ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE];
ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE];
int low_rank = ompi_comm_rank(low_comm); int low_rank = ompi_comm_rank(low_comm);
int w_rank = ompi_comm_rank(comm);
/* Init topo */
int *topo = mca_coll_han_topo_init(comm, han_module, 2);
/* unbalanced case needs algo adaptation */
if (han_module->are_ppn_imbalanced) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle allgather with this communicator (imbalance). Fall back on another component\n"));
HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, allgather);
return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype,
comm, comm->c_coll->coll_allgather_module);
}
ompi_request_t *temp_request = NULL; ompi_request_t *temp_request = NULL;
/* Set up request */ /* Set up request */
temp_request = OBJ_NEW(ompi_request_t); temp_request = OBJ_NEW(ompi_request_t);
OMPI_REQUEST_INIT(temp_request, false);
temp_request->req_state = OMPI_REQUEST_ACTIVE; temp_request->req_state = OMPI_REQUEST_ACTIVE;
temp_request->req_type = 0; temp_request->req_type = OMPI_REQUEST_COLL;
temp_request->req_free = han_request_free; temp_request->req_free = han_request_free;
temp_request->req_status.MPI_SOURCE = 0; temp_request->req_status = (ompi_status_public_t){0};
temp_request->req_status.MPI_TAG = 0; temp_request->req_complete = REQUEST_PENDING;
temp_request->req_status.MPI_ERROR = 0;
temp_request->req_status._cancelled = 0;
temp_request->req_status._ucount = 0;
/* Init topo */
int *topo = mca_coll_han_topo_init(comm, han_module, 2);
int root_low_rank = 0; int root_low_rank = 0;
/* Create lg (lower level gather) task */ /* Create lg (lower level gather) task */
mca_coll_task_t *lg = OBJ_NEW(mca_coll_task_t); mca_coll_task_t *lg = OBJ_NEW(mca_coll_task_t);
/* Setup lg task arguments */ /* Setup lg task arguments */
mca_allgather_argu_t *lg_argu = malloc(sizeof(mca_allgather_argu_t)); mca_coll_han_allgather_t *lg_args = malloc(sizeof(mca_coll_han_allgather_t));
mac_coll_han_set_allgather_argu(lg_argu, lg, (char *) sbuf, NULL, scount, sdtype, rbuf, rcount, mca_coll_han_set_allgather_args(lg_args, lg, (char *) sbuf, NULL, scount, sdtype, rbuf, rcount,
rdtype, root_low_rank, up_comm, low_comm, w_rank, rdtype, root_low_rank, up_comm, low_comm, w_rank,
low_rank != root_low_rank, han_module->is_mapbycore, topo, low_rank != root_low_rank, han_module->is_mapbycore, topo,
temp_request); temp_request);
/* Init lg task */ /* Init and issue lg task */
init_task(lg, mca_coll_han_allgather_lg_task, (void *) (lg_argu)); init_task(lg, mca_coll_han_allgather_lg_task, (void *) (lg_args));
/* Issure lg task */
issue_task(lg); issue_task(lg);
ompi_request_wait(&temp_request, MPI_STATUS_IGNORE); ompi_request_wait(&temp_request, MPI_STATUS_IGNORE);
@ -105,48 +118,70 @@ mca_coll_han_allgather_intra(const void *sbuf, int scount,
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
/* lg: lower level (shared memory) gather task */ /* lg: lower level gather task */
int mca_coll_han_allgather_lg_task(void *task_argu) int mca_coll_han_allgather_lg_task(void *task_args)
{ {
mca_allgather_argu_t *t = (mca_allgather_argu_t *) task_argu; mca_coll_han_allgather_t *t = (mca_coll_han_allgather_t *) task_args;
char *tmp_buf = NULL, *tmp_rbuf = NULL;
char *tmp_send = NULL;
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] HAN Allgather: lg\n", OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] HAN Allgather: lg\n",
t->w_rank)); t->w_rank));
OBJ_RELEASE(t->cur_task);
/* If the process is one of the node leader */ /* If the process is one of the node leader */
char *tmp_buf = NULL; ptrdiff_t rlb, rext;
char *tmp_rbuf = NULL; ompi_datatype_get_extent (t->rdtype, &rlb, &rext);
if (MPI_IN_PLACE == t->sbuf) {
t->sdtype = t->rdtype;
t->scount = t->rcount;
}
if (!t->noop) { if (!t->noop) {
int low_size = ompi_comm_size(t->low_comm); int low_size = ompi_comm_size(t->low_comm);
ptrdiff_t rsize, rgap = 0; ptrdiff_t rsize, rgap = 0;
rsize = opal_datatype_span(&t->rdtype->super, (int64_t) t->rcount * low_size, &rgap); rsize = opal_datatype_span(&t->rdtype->super, (int64_t) t->rcount * low_size, &rgap);
tmp_buf = (char *) malloc(rsize); tmp_buf = (char *) malloc(rsize);
tmp_rbuf = tmp_buf - rgap; tmp_rbuf = tmp_buf - rgap;
if (MPI_IN_PLACE == t->sbuf) {
tmp_send = ((char*)t->rbuf) + (ptrdiff_t)t->w_rank * (ptrdiff_t)t->rcount * rext;
ompi_datatype_copy_content_same_ddt(t->rdtype, t->rcount, tmp_rbuf, tmp_send);
}
} }
/* Shared memory gather */ /* Lower level (shared memory or intra-node) gather */
t->low_comm->c_coll->coll_gather((char *) t->sbuf, t->scount, t->sdtype, tmp_rbuf, t->rcount, if (MPI_IN_PLACE == t->sbuf) {
t->rdtype, t->root_low_rank, t->low_comm, if (!t->noop) {
t->low_comm->c_coll->coll_gather_module); t->low_comm->c_coll->coll_gather(MPI_IN_PLACE, t->scount, t->sdtype,
tmp_rbuf, t->rcount, t->rdtype, t->root_low_rank,
t->low_comm, t->low_comm->c_coll->coll_gather_module);
}
else {
tmp_send = ((char*)t->rbuf) + (ptrdiff_t)t->w_rank * (ptrdiff_t)t->rcount * rext;
t->low_comm->c_coll->coll_gather(tmp_send, t->rcount, t->rdtype,
NULL, t->rcount, t->rdtype, t->root_low_rank,
t->low_comm, t->low_comm->c_coll->coll_gather_module);
}
}
else {
t->low_comm->c_coll->coll_gather((char *) t->sbuf, t->scount, t->sdtype, tmp_rbuf, t->rcount,
t->rdtype, t->root_low_rank, t->low_comm,
t->low_comm->c_coll->coll_gather_module);
}
t->sbuf = tmp_rbuf; t->sbuf = tmp_rbuf;
t->sbuf_inter_free = tmp_buf; t->sbuf_inter_free = tmp_buf;
/* Create uag (upper level all-gather) task */ /* Create uag (upper level all-gather) task */
mca_coll_task_t *uag = OBJ_NEW(mca_coll_task_t); mca_coll_task_t *uag = t->cur_task;
/* Setup uag task arguments */ /* Init and issue uag task */
t->cur_task = uag;
/* Init uag task */
init_task(uag, mca_coll_han_allgather_uag_task, (void *) t); init_task(uag, mca_coll_han_allgather_uag_task, (void *) t);
/* Issure uag task */
issue_task(uag); issue_task(uag);
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
/* uag: upper level (inter-node) all-gather task */ /* uag: upper level (inter-node) all-gather task */
int mca_coll_han_allgather_uag_task(void *task_argu) int mca_coll_han_allgather_uag_task(void *task_args)
{ {
mca_allgather_argu_t *t = (mca_allgather_argu_t *) task_argu; mca_coll_han_allgather_t *t = (mca_coll_han_allgather_t *) task_args;
OBJ_RELEASE(t->cur_task);
if (t->noop) { if (t->noop) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
@ -213,21 +248,18 @@ int mca_coll_han_allgather_uag_task(void *task_argu)
/* Create lb (low level broadcast) task */ /* Create lb (low level broadcast) task */
mca_coll_task_t *lb = OBJ_NEW(mca_coll_task_t); mca_coll_task_t *lb = t->cur_task;
/* Setup lb task arguments */ /* Init and issue lb task */
t->cur_task = lb;
/* Init lb task */
init_task(lb, mca_coll_han_allgather_lb_task, (void *) t); init_task(lb, mca_coll_han_allgather_lb_task, (void *) t);
/* Issure lb task */
issue_task(lb); issue_task(lb);
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
/* lb: low level (shared-memory) broadcast task */ /* lb: low level broadcast task */
int mca_coll_han_allgather_lb_task(void *task_argu) int mca_coll_han_allgather_lb_task(void *task_args)
{ {
mca_allgather_argu_t *t = (mca_allgather_argu_t *) task_argu; mca_coll_han_allgather_t *t = (mca_coll_han_allgather_t *) task_args;
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] HAN Allgather: uag noop\n", OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] HAN Allgather: uag noop\n",
t->w_rank)); t->w_rank));
OBJ_RELEASE(t->cur_task); OBJ_RELEASE(t->cur_task);
@ -246,30 +278,41 @@ int mca_coll_han_allgather_lb_task(void *task_argu)
int int
mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, mca_coll_han_allgather_intra_simple(const void *sbuf, int scount,
struct ompi_datatype_t *sdtype, struct ompi_datatype_t *sdtype,
void* rbuf, int rcount, void* rbuf, int rcount,
struct ompi_datatype_t *rdtype, struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm, struct ompi_communicator_t *comm,
mca_coll_base_module_t *module){ mca_coll_base_module_t *module){
/* create the subcommunicators */ /* create the subcommunicators */
mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;
mca_coll_han_comm_create_new(comm, han_module);
ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE];
ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE];
if( OMPI_SUCCESS != mca_coll_han_comm_create_new(comm, han_module) ) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle allgather within this communicator. Fall back on another component\n"));
/* HAN cannot work with this communicator so fallback on all collectives */
HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype,
comm, comm->c_coll->coll_allgather_module);
}
/* discovery topology */ /* discovery topology */
int *topo = mca_coll_han_topo_init(comm, han_module, 2); int *topo = mca_coll_han_topo_init(comm, han_module, 2);
/* unbalanced case needs algo adaptation */ /* unbalanced case needs algo adaptation */
if (han_module->are_ppn_imbalanced){ if (han_module->are_ppn_imbalanced) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle allgather with this communicator. It need to fall back on another component\n")); "han cannot handle allgather within this communicator (imbalance). Fall back on another component\n"));
return han_module->previous_allgather(sbuf, scount, sdtype, rbuf, /* Put back the fallback collective support and call it once. All
rcount, rdtype, * future calls will then be automatically redirected.
comm, han_module->previous_allgather_module); */
HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, allgather);
return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype,
comm, comm->c_coll->coll_allgather_module);
} }
ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE];
ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE];
int w_rank = ompi_comm_rank(comm);
/* setup up/low coordinates */ /* setup up/low coordinates */
int low_rank = ompi_comm_rank(low_comm); int low_rank = ompi_comm_rank(low_comm);
int low_size = ompi_comm_size(low_comm); int low_size = ompi_comm_size(low_comm);
@ -279,27 +322,54 @@ mca_coll_han_allgather_intra_simple(const void *sbuf, int scount,
/* allocate the intermediary buffer /* allocate the intermediary buffer
* to gather on leaders on the low sub communicator */ * to gather on leaders on the low sub communicator */
ptrdiff_t rlb, rext;
ompi_datatype_get_extent (rdtype, &rlb, &rext);
char *tmp_buf = NULL; char *tmp_buf = NULL;
char *tmp_buf_start = NULL; char *tmp_buf_start = NULL;
char *tmp_send = NULL;
if (MPI_IN_PLACE == sbuf) {
scount = rcount;
sdtype = rdtype;
}
if (low_rank == root_low_rank) { if (low_rank == root_low_rank) {
ptrdiff_t rsize, rgap = 0; ptrdiff_t rsize, rgap = 0;
/* Compute the size to receive all the local data, including datatypes empty gaps */ /* Compute the size to receive all the local data, including datatypes empty gaps */
rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * low_size, &rgap); rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * low_size, &rgap);
// intermediary buffer on node leaders to gather on low comm /* intermediary buffer on node leaders to gather on low comm */
tmp_buf = (char *) malloc(rsize); tmp_buf = (char *) malloc(rsize);
tmp_buf_start = tmp_buf - rgap; tmp_buf_start = tmp_buf - rgap;
if (MPI_IN_PLACE == sbuf) {
tmp_send = ((char*)rbuf) + (ptrdiff_t)w_rank * (ptrdiff_t)rcount * rext;
ompi_datatype_copy_content_same_ddt(rdtype, rcount, tmp_buf_start, tmp_send);
}
} }
/* 1. low gather on node leaders into tmp_buf */ /* 1. low gather on node leaders into tmp_buf */
low_comm->c_coll->coll_gather((char *)sbuf, scount, sdtype, if (MPI_IN_PLACE == sbuf) {
tmp_buf_start, rcount, rdtype, root_low_rank, if (low_rank == root_low_rank) {
low_comm, low_comm->c_coll->coll_gather_module); low_comm->c_coll->coll_gather(MPI_IN_PLACE, scount, sdtype,
tmp_buf_start, rcount, rdtype, root_low_rank,
low_comm, low_comm->c_coll->coll_gather_module);
}
else {
tmp_send = ((char*)rbuf) + (ptrdiff_t)w_rank * (ptrdiff_t)rcount * rext;
low_comm->c_coll->coll_gather(tmp_send, rcount, rdtype,
NULL, rcount, rdtype, root_low_rank,
low_comm, low_comm->c_coll->coll_gather_module);
}
}
else {
low_comm->c_coll->coll_gather((char *)sbuf, scount, sdtype,
tmp_buf_start, rcount, rdtype, root_low_rank,
low_comm, low_comm->c_coll->coll_gather_module);
}
/* 2. allgather between node leaders, from tmp_buf to reorder_buf */ /* 2. allgather between node leaders, from tmp_buf to reorder_buf */
if (low_rank == root_low_rank) { if (low_rank == root_low_rank) {
/* allocate buffer to store unordered result on node leaders /* allocate buffer to store unordered result on node leaders
* * if the processes are mapped-by core, no need to reorder: * if the processes are mapped-by core, no need to reorder:
* * distribution of ranks on core first and node next, * distribution of ranks on core first and node next,
* * in a increasing order for both patterns */ * in a increasing order for both patterns.
*/
char *reorder_buf = NULL; char *reorder_buf = NULL;
char *reorder_buf_start = NULL; char *reorder_buf_start = NULL;
if (han_module->is_mapbycore) { if (han_module->is_mapbycore) {
@ -307,7 +377,7 @@ mca_coll_han_allgather_intra_simple(const void *sbuf, int scount,
} else { } else {
if (0 == low_rank && 0 == up_rank) { // first rank displays message if (0 == low_rank && 0 == up_rank) { // first rank displays message
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"[%d]: Future Allgather needs reordering: ", w_rank)); "[%d]: Future Allgather needs reordering: ", up_rank));
} }
ptrdiff_t rsize, rgap = 0; ptrdiff_t rsize, rgap = 0;
rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * low_size * up_size, &rgap); rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * low_size * up_size, &rgap);
@ -332,8 +402,8 @@ mca_coll_han_allgather_intra_simple(const void *sbuf, int scount,
*/ */
if (!han_module->is_mapbycore) { if (!han_module->is_mapbycore) {
ompi_coll_han_reorder_gather(reorder_buf_start, ompi_coll_han_reorder_gather(reorder_buf_start,
rbuf, rcount, rdtype, rbuf, rcount, rdtype,
comm, topo); comm, topo);
free(reorder_buf); free(reorder_buf);
reorder_buf = NULL; reorder_buf = NULL;
} }
@ -347,4 +417,4 @@ mca_coll_han_allgather_intra_simple(const void *sbuf, int scount,
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }

Просмотреть файл

@ -17,46 +17,52 @@
#include "ompi/mca/pml/pml.h" #include "ompi/mca/pml/pml.h"
#include "coll_han_trigger.h" #include "coll_han_trigger.h"
static int mca_coll_han_allreduce_t0_task(void *task_args);
static int mca_coll_han_allreduce_t1_task(void *task_args);
static int mca_coll_han_allreduce_t2_task(void *task_args);
static int mca_coll_han_allreduce_t3_task(void *task_args);
/* Only work with regular situation (each node has equal number of processes) */ /* Only work with regular situation (each node has equal number of processes) */
void mac_coll_han_set_allreduce_argu(mca_allreduce_argu_t * argu, static inline void
mca_coll_task_t * cur_task, mca_coll_han_set_allreduce_args(mca_coll_han_allreduce_args_t * args,
void *sbuf, mca_coll_task_t * cur_task,
void *rbuf, void *sbuf,
int seg_count, void *rbuf,
struct ompi_datatype_t *dtype, int seg_count,
struct ompi_op_t *op, struct ompi_datatype_t *dtype,
int root_up_rank, struct ompi_op_t *op,
int root_low_rank, int root_up_rank,
struct ompi_communicator_t *up_comm, int root_low_rank,
struct ompi_communicator_t *low_comm, struct ompi_communicator_t *up_comm,
int num_segments, struct ompi_communicator_t *low_comm,
int cur_seg, int num_segments,
int w_rank, int cur_seg,
int last_seg_count, int w_rank,
bool noop, ompi_request_t * req, int *completed) int last_seg_count,
bool noop, ompi_request_t * req, int *completed)
{ {
argu->cur_task = cur_task; args->cur_task = cur_task;
argu->sbuf = sbuf; args->sbuf = sbuf;
argu->rbuf = rbuf; args->rbuf = rbuf;
argu->seg_count = seg_count; args->seg_count = seg_count;
argu->dtype = dtype; args->dtype = dtype;
argu->op = op; args->op = op;
argu->root_up_rank = root_up_rank; args->root_up_rank = root_up_rank;
argu->root_low_rank = root_low_rank; args->root_low_rank = root_low_rank;
argu->up_comm = up_comm; args->up_comm = up_comm;
argu->low_comm = low_comm; args->low_comm = low_comm;
argu->num_segments = num_segments; args->num_segments = num_segments;
argu->cur_seg = cur_seg; args->cur_seg = cur_seg;
argu->w_rank = w_rank; args->w_rank = w_rank;
argu->last_seg_count = last_seg_count; args->last_seg_count = last_seg_count;
argu->noop = noop; args->noop = noop;
argu->req = req; args->req = req;
argu->completed = completed; args->completed = completed;
} }
/* /*
* Each segment of the messsage needs to go though 4 steps to perform MPI_Allreduce: * Each segment of the messsage needs to go though 4 steps to perform MPI_Allreduce:
* lr: lower level (shared-memory or intra-node) reduce, * lr: lower level (shared-memory or intra-node) reduce,
* ur: upper level (inter-node) reduce, * ur: upper level (inter-node) reduce,
* ub: upper level (inter-node) bcast, * ub: upper level (inter-node) bcast,
@ -80,72 +86,40 @@ mca_coll_han_allreduce_intra(const void *sbuf,
struct ompi_op_t *op, struct ompi_op_t *op,
struct ompi_communicator_t *comm, mca_coll_base_module_t * module) struct ompi_communicator_t *comm, mca_coll_base_module_t * module)
{ {
// Fallback to another component if the op cannot commute
mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;
if (! ompi_op_is_commute(op)) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle allreduce with this communicator."
"It need to fall back on another component\n"));
return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op,
comm, han_module->previous_allreduce_module);
}
/* No support for non-commutative operations */
ptrdiff_t extent, lb; if(!ompi_op_is_commute(op)) {
ompi_datatype_get_extent(dtype, &lb, &extent); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
int w_rank; "han cannot handle allreduce with this operation. Fall back on another component\n"));
w_rank = ompi_comm_rank(comm); goto prev_allreduce_intra;
int seg_count = count; }
size_t typelng;
ompi_datatype_type_size(dtype, &typelng);
/* Create the subcommunicators */ /* Create the subcommunicators */
mca_coll_han_comm_create(comm, han_module); if( OMPI_SUCCESS != mca_coll_han_comm_create(comm, han_module) ) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle allreduce with this communicator. Drop HAN support in this communicator and fall back on another component\n"));
/* HAN cannot work with this communicator so fallback on all collectives */
HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
return comm->c_coll->coll_allreduce(sbuf, rbuf, count, dtype, op,
comm, comm->c_coll->coll_reduce_module);
}
ptrdiff_t extent, lb;
size_t dtype_size;
ompi_datatype_get_extent(dtype, &lb, &extent);
int seg_count = count, w_rank;
w_rank = ompi_comm_rank(comm);
ompi_datatype_type_size(dtype, &dtype_size);
ompi_communicator_t *low_comm; ompi_communicator_t *low_comm;
ompi_communicator_t *up_comm; ompi_communicator_t *up_comm;
/* Auto tune is enabled */
if (mca_coll_han_component.han_auto_tune && mca_coll_han_component.han_auto_tuned != NULL) { /* use MCA parameters for now */
uint32_t n = han_auto_tuned_get_n(ompi_comm_size(han_module->cached_up_comms[0])); low_comm = han_module->cached_low_comms[mca_coll_han_component.han_allreduce_low_module];
uint32_t c = han_auto_tuned_get_c(ompi_comm_size(han_module->cached_low_comms[0])); up_comm = han_module->cached_up_comms[mca_coll_han_component.han_allreduce_up_module];
uint32_t m = han_auto_tuned_get_m(typelng * count); COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_allreduce_segsize, dtype_size,
uint32_t id = seg_count);
n * mca_coll_han_component.han_auto_tune_c * mca_coll_han_component.han_auto_tune_m +
c * mca_coll_han_component.han_auto_tune_m + m +
mca_coll_han_component.han_auto_tune_n * mca_coll_han_component.han_auto_tune_c *
mca_coll_han_component.han_auto_tune_m;
uint32_t umod = mca_coll_han_component.han_auto_tuned[id].umod;
uint32_t lmod = mca_coll_han_component.han_auto_tuned[id].lmod;
uint32_t fs = mca_coll_han_component.han_auto_tuned[id].fs;
/* ualg and us are only available when using ADAPT */
/*
uint32_t ualg = mca_coll_han_component.han_auto_tuned[id].ualg;
uint32_t us = mca_coll_han_component.han_auto_tuned[id].us;
*/
/* Set up umod */
up_comm = han_module->cached_up_comms[umod];
/* Set up lmod */
low_comm = han_module->cached_low_comms[lmod];
/* Set up fs */
COLL_BASE_COMPUTED_SEGCOUNT((size_t) fs, typelng, seg_count);
/* Set up ualg and us, which is only available when using ADAPT */
/*
if (umod == 1) {
((mca_coll_adapt_module_t *) (up_comm->c_coll->coll_ibcast_module))->adapt_component->
adapt_ibcast_algorithm = ualg;
((mca_coll_adapt_module_t *) (up_comm->c_coll->coll_ibcast_module))->adapt_component->
adapt_ibcast_algorithm = ualg;
((mca_coll_adapt_module_t *) (up_comm->c_coll->coll_ibcast_module))->adapt_component->
adapt_ibcast_segment_size = us;
((mca_coll_adapt_module_t *) (up_comm->c_coll->coll_ibcast_module))->adapt_component->
adapt_ibcast_segment_size = us;
}
*/
} else {
low_comm = han_module->cached_low_comms[mca_coll_han_component.han_bcast_low_module];
up_comm = han_module->cached_up_comms[mca_coll_han_component.han_bcast_up_module];
COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_allreduce_segsize, typelng,
seg_count);
}
/* Determine number of elements sent per task. */ /* Determine number of elements sent per task. */
OPAL_OUTPUT_VERBOSE((10, mca_coll_han_component.han_output, OPAL_OUTPUT_VERBOSE((10, mca_coll_han_component.han_output,
@ -161,8 +135,8 @@ mca_coll_han_allreduce_intra(const void *sbuf,
/* Setup up t0 task arguments */ /* Setup up t0 task arguments */
int *completed = (int *) malloc(sizeof(int)); int *completed = (int *) malloc(sizeof(int));
completed[0] = 0; completed[0] = 0;
mca_allreduce_argu_t *t = malloc(sizeof(mca_allreduce_argu_t)); mca_coll_han_allreduce_args_t *t = malloc(sizeof(mca_coll_han_allreduce_args_t));
mac_coll_han_set_allreduce_argu(t, t0, (char *) sbuf, (char *) rbuf, seg_count, dtype, op, mca_coll_han_set_allreduce_args(t, t0, (char *) sbuf, (char *) rbuf, seg_count, dtype, op,
root_up_rank, root_low_rank, up_comm, low_comm, num_segments, 0, root_up_rank, root_low_rank, up_comm, low_comm, num_segments, 0,
w_rank, count - (num_segments - 1) * seg_count, w_rank, count - (num_segments - 1) * seg_count,
low_rank != root_low_rank, NULL, completed); low_rank != root_low_rank, NULL, completed);
@ -208,35 +182,51 @@ mca_coll_han_allreduce_intra(const void *sbuf,
init_task(t3, mca_coll_han_allreduce_t3_task, (void *) t); init_task(t3, mca_coll_han_allreduce_t3_task, (void *) t);
issue_task(t3); issue_task(t3);
} }
if (t->completed != NULL) { free(t->completed);
free(t->completed); t->completed = NULL;
t->completed = NULL;
}
free(t); free(t);
return OMPI_SUCCESS; return OMPI_SUCCESS;
prev_allreduce_intra:
return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op,
comm, han_module->previous_allreduce_module);
} }
/* t0 task */ /* t0 task */
int mca_coll_han_allreduce_t0_task(void *task_argu) int mca_coll_han_allreduce_t0_task(void *task_args)
{ {
mca_allreduce_argu_t *t = (mca_allreduce_argu_t *) task_argu; mca_coll_han_allreduce_args_t *t = (mca_coll_han_allreduce_args_t *) task_args;
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"[%d] HAN Allreduce: t0 %d r_buf %d\n", t->w_rank, t->cur_seg, "[%d] HAN Allreduce: t0 %d r_buf %d\n", t->w_rank, t->cur_seg,
((int *) t->rbuf)[0])); ((int *) t->rbuf)[0]));
OBJ_RELEASE(t->cur_task); OBJ_RELEASE(t->cur_task);
ptrdiff_t extent, lb; ptrdiff_t extent, lb;
ompi_datatype_get_extent(t->dtype, &lb, &extent); ompi_datatype_get_extent(t->dtype, &lb, &extent);
t->low_comm->c_coll->coll_reduce((char *) t->sbuf, (char *) t->rbuf, t->seg_count, t->dtype, if (MPI_IN_PLACE == t->sbuf) {
t->op, t->root_low_rank, t->low_comm, if (!t->noop) {
t->low_comm->c_coll->coll_reduce_module); t->low_comm->c_coll->coll_reduce(MPI_IN_PLACE, (char *) t->rbuf, t->seg_count, t->dtype,
t->op, t->root_low_rank, t->low_comm,
t->low_comm->c_coll->coll_reduce_module);
}
else {
t->low_comm->c_coll->coll_reduce((char *) t->rbuf, NULL, t->seg_count, t->dtype,
t->op, t->root_low_rank, t->low_comm,
t->low_comm->c_coll->coll_reduce_module);
}
}
else {
t->low_comm->c_coll->coll_reduce((char *) t->sbuf, (char *) t->rbuf, t->seg_count, t->dtype,
t->op, t->root_low_rank, t->low_comm,
t->low_comm->c_coll->coll_reduce_module);
}
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
/* t1 task */ /* t1 task */
int mca_coll_han_allreduce_t1_task(void *task_argu) int mca_coll_han_allreduce_t1_task(void *task_args)
{ {
mca_allreduce_argu_t *t = (mca_allreduce_argu_t *) task_argu; mca_coll_han_allreduce_args_t *t = (mca_coll_han_allreduce_args_t *) task_args;
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"[%d] HAN Allreduce: t1 %d r_buf %d\n", t->w_rank, t->cur_seg, "[%d] HAN Allreduce: t1 %d r_buf %d\n", t->w_rank, t->cur_seg,
((int *) t->rbuf)[0])); ((int *) t->rbuf)[0]));
@ -270,16 +260,16 @@ int mca_coll_han_allreduce_t1_task(void *task_argu)
} }
if (!t->noop) { if (!t->noop) {
ompi_request_wait(&ireduce_req, MPI_STATUSES_IGNORE); ompi_request_wait(&ireduce_req, MPI_STATUS_IGNORE);
} }
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
/* t2 task */ /* t2 task */
int mca_coll_han_allreduce_t2_task(void *task_argu) int mca_coll_han_allreduce_t2_task(void *task_args)
{ {
mca_allreduce_argu_t *t = (mca_allreduce_argu_t *) task_argu; mca_coll_han_allreduce_args_t *t = (mca_coll_han_allreduce_args_t *) task_args;
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"[%d] HAN Allreduce: t2 %d r_buf %d\n", t->w_rank, t->cur_seg, "[%d] HAN Allreduce: t2 %d r_buf %d\n", t->w_rank, t->cur_seg,
((int *) t->rbuf)[0])); ((int *) t->rbuf)[0]));
@ -336,9 +326,9 @@ int mca_coll_han_allreduce_t2_task(void *task_argu)
} }
/* t3 task */ /* t3 task */
int mca_coll_han_allreduce_t3_task(void *task_argu) int mca_coll_han_allreduce_t3_task(void *task_args)
{ {
mca_allreduce_argu_t *t = (mca_allreduce_argu_t *) task_argu; mca_coll_han_allreduce_args_t *t = (mca_coll_han_allreduce_args_t *) task_args;
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"[%d] HAN Allreduce: t3 %d r_buf %d\n", t->w_rank, t->cur_seg, "[%d] HAN Allreduce: t3 %d r_buf %d\n", t->w_rank, t->cur_seg,
((int *) t->rbuf)[0])); ((int *) t->rbuf)[0]));
@ -408,12 +398,12 @@ int mca_coll_han_allreduce_t3_task(void *task_argu)
int int
mca_coll_han_allreduce_intra_simple(const void *sbuf, mca_coll_han_allreduce_intra_simple(const void *sbuf,
void *rbuf, void *rbuf,
int count, int count,
struct ompi_datatype_t *dtype, struct ompi_datatype_t *dtype,
struct ompi_op_t *op, struct ompi_op_t *op,
struct ompi_communicator_t *comm, struct ompi_communicator_t *comm,
mca_coll_base_module_t *module) mca_coll_base_module_t *module)
{ {
ompi_communicator_t *low_comm; ompi_communicator_t *low_comm;
ompi_communicator_t *up_comm; ompi_communicator_t *up_comm;
@ -428,22 +418,43 @@ mca_coll_han_allreduce_intra_simple(const void *sbuf,
// Fallback to another component if the op cannot commute // Fallback to another component if the op cannot commute
if (! ompi_op_is_commute(op)) { if (! ompi_op_is_commute(op)) {
OPAL_OUTPUT_VERBOSE((30, cs->han_output, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle allreduce with this operation." "han cannot handle allreduce with this operation. Fall back on another component\n"));
"It need to fall back on another component\n"));
goto prev_allreduce; goto prev_allreduce;
} }
mca_coll_han_comm_create_new(comm, han_module); /* Create the subcommunicators */
if( OMPI_SUCCESS != mca_coll_han_comm_create_new(comm, han_module) ) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle allreduce with this communicator. Drop HAN support in this communicator and fall back on another component\n"));
/* HAN cannot work with this communicator so fallback on all collectives */
HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
return comm->c_coll->coll_allreduce(sbuf, rbuf, count, dtype, op,
comm, comm->c_coll->coll_reduce_module);
}
low_comm = han_module->sub_comm[INTRA_NODE]; low_comm = han_module->sub_comm[INTRA_NODE];
up_comm = han_module->sub_comm[INTER_NODE]; up_comm = han_module->sub_comm[INTER_NODE];
low_rank = ompi_comm_rank(low_comm); low_rank = ompi_comm_rank(low_comm);
/* Low_comm reduce */ /* Low_comm reduce */
ret = low_comm->c_coll->coll_reduce((char *)sbuf, (char *)rbuf, if (MPI_IN_PLACE == sbuf) {
if (low_rank == root_low_rank) {
ret = low_comm->c_coll->coll_reduce(MPI_IN_PLACE, (char *)rbuf,
count, dtype, op, root_low_rank, count, dtype, op, root_low_rank,
low_comm, low_comm->c_coll->coll_reduce_module); low_comm, low_comm->c_coll->coll_reduce_module);
}
else {
ret = low_comm->c_coll->coll_reduce((char *)rbuf, NULL,
count, dtype, op, root_low_rank,
low_comm, low_comm->c_coll->coll_reduce_module);
}
}
else {
ret = low_comm->c_coll->coll_reduce((char *)sbuf, (char *)rbuf,
count, dtype, op, root_low_rank,
low_comm, low_comm->c_coll->coll_reduce_module);
}
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
OPAL_OUTPUT_VERBOSE((30, cs->han_output, OPAL_OUTPUT_VERBOSE((30, cs->han_output,
"HAN/ALLREDUCE: low comm reduce failed. " "HAN/ALLREDUCE: low comm reduce failed. "
@ -480,9 +491,9 @@ mca_coll_han_allreduce_intra_simple(const void *sbuf,
return OMPI_SUCCESS; return OMPI_SUCCESS;
prev_allreduce: prev_allreduce:
return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op, comm, return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op,
han_module->previous_allreduce_module); comm, han_module->previous_allreduce_module);
} }
/* Find a fallback on reproducible algorithm /* Find a fallback on reproducible algorithm
@ -504,15 +515,14 @@ mca_coll_han_allreduce_reproducible_decision(struct ompi_communicator_t *comm,
int i; int i;
for (i=0; i<fallbacks_len; i++) { for (i=0; i<fallbacks_len; i++) {
int fallback = fallbacks[i]; int fallback = fallbacks[i];
mca_coll_base_module_t *fallback_module = han_module->modules_storage mca_coll_base_module_t *fallback_module
.modules[fallback] = han_module->modules_storage.modules[fallback].module_handler;
.module_handler;
if (NULL != fallback_module && NULL != fallback_module->coll_allreduce) { if (NULL != fallback_module && NULL != fallback_module->coll_allreduce) {
if (0 == w_rank) { if (0 == w_rank) {
opal_output_verbose(30, mca_coll_han_component.han_output, opal_output_verbose(30, mca_coll_han_component.han_output,
"coll:han:allreduce_reproducible: " "coll:han:allreduce_reproducible: "
"fallback on %s\n", "fallback on %s\n",
components_name[fallback]); available_components[fallback].component_name);
} }
han_module->reproducible_allreduce_module = fallback_module; han_module->reproducible_allreduce_module = fallback_module;
han_module->reproducible_allreduce = fallback_module->coll_allreduce; han_module->reproducible_allreduce = fallback_module->coll_allreduce;
@ -525,8 +535,7 @@ mca_coll_han_allreduce_reproducible_decision(struct ompi_communicator_t *comm,
"coll:han:allreduce_reproducible_decision: " "coll:han:allreduce_reproducible_decision: "
"no reproducible fallback\n"); "no reproducible fallback\n");
} }
han_module->reproducible_allreduce_module = han_module->reproducible_allreduce_module = han_module->previous_allreduce_module;
han_module->previous_allreduce_module;
han_module->reproducible_allreduce = han_module->previous_allreduce; han_module->reproducible_allreduce = han_module->previous_allreduce;
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }

Просмотреть файл

@ -16,31 +16,35 @@
#include "ompi/mca/pml/pml.h" #include "ompi/mca/pml/pml.h"
#include "coll_han_trigger.h" #include "coll_han_trigger.h"
void mac_coll_han_set_bcast_argu(mca_bcast_argu_t * argu, mca_coll_task_t * cur_task, void *buff, static int mca_coll_han_bcast_t0_task(void *task_args);
int seg_count, struct ompi_datatype_t *dtype, static int mca_coll_han_bcast_t1_task(void *task_args);
int root_up_rank, int root_low_rank,
struct ompi_communicator_t *up_comm, static inline void
struct ompi_communicator_t *low_comm, mca_coll_han_set_bcast_args(mca_coll_han_bcast_args_t * args, mca_coll_task_t * cur_task, void *buff,
int num_segments, int cur_seg, int w_rank, int last_seg_count, int seg_count, struct ompi_datatype_t *dtype,
bool noop) int root_up_rank, int root_low_rank,
struct ompi_communicator_t *up_comm,
struct ompi_communicator_t *low_comm,
int num_segments, int cur_seg, int w_rank, int last_seg_count,
bool noop)
{ {
argu->cur_task = cur_task; args->cur_task = cur_task;
argu->buff = buff; args->buff = buff;
argu->seg_count = seg_count; args->seg_count = seg_count;
argu->dtype = dtype; args->dtype = dtype;
argu->root_low_rank = root_low_rank; args->root_low_rank = root_low_rank;
argu->root_up_rank = root_up_rank; args->root_up_rank = root_up_rank;
argu->up_comm = up_comm; args->up_comm = up_comm;
argu->low_comm = low_comm; args->low_comm = low_comm;
argu->num_segments = num_segments; args->num_segments = num_segments;
argu->cur_seg = cur_seg; args->cur_seg = cur_seg;
argu->w_rank = w_rank; args->w_rank = w_rank;
argu->last_seg_count = last_seg_count; args->last_seg_count = last_seg_count;
argu->noop = noop; args->noop = noop;
} }
/* /*
* Each segment of the messsage needs to go though 2 steps to perform MPI_Bcast: * Each segment of the messsage needs to go though 2 steps to perform MPI_Bcast:
* ub: upper level (inter-node) bcast * ub: upper level (inter-node) bcast
* lb: low level (shared-memory or intra-node) bcast. * lb: low level (shared-memory or intra-node) bcast.
* Hence, in each iteration, there is a combination of collective operations which is called a task. * Hence, in each iteration, there is a combination of collective operations which is called a task.
@ -58,82 +62,57 @@ mca_coll_han_bcast_intra(void *buff,
int root, int root,
struct ompi_communicator_t *comm, mca_coll_base_module_t * module) struct ompi_communicator_t *comm, mca_coll_base_module_t * module)
{ {
ptrdiff_t extent, lb;
ompi_datatype_get_extent(dtype, &lb, &extent);
int w_rank;
w_rank = ompi_comm_rank(comm);
int seg_count = count;
size_t typelng;
mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;
int err, seg_count = count, w_rank = ompi_comm_rank(comm);
ompi_communicator_t *low_comm, *up_comm;
ptrdiff_t extent, lb;
size_t dtype_size;
/* Create the subcommunicators */
err = mca_coll_han_comm_create(comm, han_module);
if( OMPI_SUCCESS != err ) { /* Let's hope the error is consistently returned across the entire communicator */
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle bcast with this communicator. Fall back on another component\n"));
/* Put back the fallback collective support and call it once. All
* future calls will then be automatically redirected.
*/
HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
return comm->c_coll->coll_bcast(buff, count, dtype, root,
comm, comm->c_coll->coll_bcast_module);
}
/* Topo must be initialized to know rank distribution which then is used to /* Topo must be initialized to know rank distribution which then is used to
* determine if han can be used */ * determine if han can be used */
mca_coll_han_topo_init(comm, han_module, 2); mca_coll_han_topo_init(comm, han_module, 2);
if (han_module->are_ppn_imbalanced) {
if (han_module->are_ppn_imbalanced){
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle bcast with this communicator. It need to fall back on another component\n")); "han cannot handle bcast with this communicator (imbalance). Fall back on another component\n"));
return han_module->previous_bcast(buff, count, dtype, root, /* Put back the fallback collective support and call it once. All
comm, han_module->previous_bcast_module); * future calls will then be automatically redirected.
*/
HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, bcast);
return comm->c_coll->coll_bcast(buff, count, dtype, root,
comm, comm->c_coll->coll_bcast_module);
} }
ompi_datatype_type_size(dtype, &typelng); ompi_datatype_get_extent(dtype, &lb, &extent);
ompi_datatype_type_size(dtype, &dtype_size);
/* Create the subcommunicators */ /* use MCA parameters for now */
mca_coll_han_comm_create(comm, han_module); low_comm = han_module->cached_low_comms[mca_coll_han_component.han_bcast_low_module];
ompi_communicator_t *low_comm; up_comm = han_module->cached_up_comms[mca_coll_han_component.han_bcast_up_module];
ompi_communicator_t *up_comm; COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_bcast_segsize, dtype_size,
/* Auto tune is enabled */ seg_count);
if (mca_coll_han_component.han_auto_tune && mca_coll_han_component.han_auto_tuned != NULL) {
uint32_t n = han_auto_tuned_get_n(ompi_comm_size(han_module->cached_up_comms[0]));
uint32_t c = han_auto_tuned_get_c(ompi_comm_size(han_module->cached_low_comms[0]));
uint32_t m = han_auto_tuned_get_m(typelng * count);
uint32_t id =
n * mca_coll_han_component.han_auto_tune_c * mca_coll_han_component.han_auto_tune_m +
c * mca_coll_han_component.han_auto_tune_m + m;
uint32_t umod = mca_coll_han_component.han_auto_tuned[id].umod;
uint32_t lmod = mca_coll_han_component.han_auto_tuned[id].lmod;
uint32_t fs = mca_coll_han_component.han_auto_tuned[id].fs;
/* ualg and us are only available when using ADAPT */
/*
uint32_t ualg = mca_coll_han_component.han_auto_tuned[id].ualg;
uint32_t us = mca_coll_han_component.han_auto_tuned[id].us;
*/
/* Set up umod */
up_comm = han_module->cached_up_comms[umod];
/* Set up lmod */
low_comm = han_module->cached_low_comms[lmod];
/* Set up fs */
COLL_BASE_COMPUTED_SEGCOUNT((size_t) fs, typelng, seg_count);
/* Set up ualg and us, which is only available when using ADAPT */
/*
if (umod == 1) {
((mca_coll_adapt_module_t *) (up_comm->c_coll->coll_ibcast_module))->adapt_component->
adapt_ibcast_algorithm = ualg;
((mca_coll_adapt_module_t *) (up_comm->c_coll->coll_ibcast_module))->adapt_component->
adapt_ibcast_segment_size = us;
}
*/
} else {
/* If auto tune is disabled, use MCA parameters */
low_comm = han_module->cached_low_comms[mca_coll_han_component.han_bcast_low_module];
up_comm = han_module->cached_up_comms[mca_coll_han_component.han_bcast_up_module];
COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_bcast_segsize, typelng,
seg_count);
}
int num_segments = (count + seg_count - 1) / seg_count; int num_segments = (count + seg_count - 1) / seg_count;
OPAL_OUTPUT_VERBOSE((20, mca_coll_han_component.han_output, OPAL_OUTPUT_VERBOSE((20, mca_coll_han_component.han_output,
"In HAN seg_count %d count %d num_seg %d\n", "In HAN seg_count %d count %d num_seg %d\n",
seg_count, count, num_segments)); seg_count, count, num_segments));
int *vranks = han_module->cached_vranks; int *vranks = han_module->cached_vranks;
int low_rank = ompi_comm_rank(low_comm); int low_rank = ompi_comm_rank(low_comm);
int low_size = ompi_comm_size(low_comm); int low_size = ompi_comm_size(low_comm);
int root_low_rank; int root_low_rank, root_up_rank;
int root_up_rank;
mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank);
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"[%d]: root_low_rank %d root_up_rank %d\n", w_rank, root_low_rank, "[%d]: root_low_rank %d root_up_rank %d\n", w_rank, root_low_rank,
@ -142,8 +121,8 @@ mca_coll_han_bcast_intra(void *buff,
/* Create t0 tasks for the first segment */ /* Create t0 tasks for the first segment */
mca_coll_task_t *t0 = OBJ_NEW(mca_coll_task_t); mca_coll_task_t *t0 = OBJ_NEW(mca_coll_task_t);
/* Setup up t0 task arguments */ /* Setup up t0 task arguments */
mca_bcast_argu_t *t = malloc(sizeof(mca_bcast_argu_t)); mca_coll_han_bcast_args_t *t = malloc(sizeof(mca_coll_han_bcast_args_t));
mac_coll_han_set_bcast_argu(t, t0, (char *) buff, seg_count, dtype, mca_coll_han_set_bcast_args(t, t0, (char *) buff, seg_count, dtype,
root_up_rank, root_low_rank, up_comm, low_comm, root_up_rank, root_low_rank, up_comm, low_comm,
num_segments, 0, w_rank, count - (num_segments - 1) * seg_count, num_segments, 0, w_rank, count - (num_segments - 1) * seg_count,
low_rank != root_low_rank); low_rank != root_low_rank);
@ -161,9 +140,7 @@ mca_coll_han_bcast_intra(void *buff,
while (t->cur_seg <= t->num_segments - 2) { while (t->cur_seg <= t->num_segments - 2) {
/* Create t1 task */ /* Create t1 task */
mca_coll_task_t *t1 = OBJ_NEW(mca_coll_task_t); t->cur_task = t1 = OBJ_NEW(mca_coll_task_t);
/* Setup up t1 task arguments */
t->cur_task = t1;
t->buff = (char *) t->buff + extent * seg_count; t->buff = (char *) t->buff + extent * seg_count;
t->cur_seg = t->cur_seg + 1; t->cur_seg = t->cur_seg + 1;
/* Init the t1 task */ /* Init the t1 task */
@ -177,43 +154,40 @@ mca_coll_han_bcast_intra(void *buff,
} }
/* t0 task: issue and wait for the upper level ibcast of segment 0 */ /* t0 task: issue and wait for the upper level ibcast of segment 0 */
int mca_coll_han_bcast_t0_task(void *task_argu) int mca_coll_han_bcast_t0_task(void *task_args)
{ {
mca_bcast_argu_t *t = (mca_bcast_argu_t *) task_argu; mca_coll_han_bcast_args_t *t = (mca_coll_han_bcast_args_t *) task_args;
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t0 %d\n", t->w_rank, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t0 %d\n", t->w_rank,
t->cur_seg)); t->cur_seg));
OBJ_RELEASE(t->cur_task); OBJ_RELEASE(t->cur_task);
if (t->noop) { if (t->noop) {
return OMPI_SUCCESS; return OMPI_SUCCESS;
} else {
ptrdiff_t extent, lb;
ompi_datatype_get_extent(t->dtype, &lb, &extent);
ompi_request_t *ibcast_req;
t->up_comm->c_coll->coll_ibcast((char *) t->buff, t->seg_count, t->dtype, t->root_up_rank,
t->up_comm, &ibcast_req, t->up_comm->c_coll->coll_ibcast_module);
ompi_request_wait(&ibcast_req, MPI_STATUSES_IGNORE);
return OMPI_SUCCESS;
} }
t->up_comm->c_coll->coll_bcast((char *) t->buff, t->seg_count, t->dtype, t->root_up_rank,
t->up_comm, t->up_comm->c_coll->coll_bcast_module);
return OMPI_SUCCESS;
} }
/* t1 task: /* t1 task:
* 1. issue the upper level ibcast of segment cur_seg + 1 * 1. issue the upper level ibcast of segment cur_seg + 1
* 2. issue the low level bcast of segment cur_seg * 2. issue the low level bcast of segment cur_seg
* 3. wait for the completion of the ibcast * 3. wait for the completion of the ibcast
*/ */
int mca_coll_han_bcast_t1_task(void *task_argu) int mca_coll_han_bcast_t1_task(void *task_args)
{ {
mca_bcast_argu_t *t = (mca_bcast_argu_t *) task_argu; mca_coll_han_bcast_args_t *t = (mca_coll_han_bcast_args_t *) task_args;
ompi_request_t *ibcast_req = NULL;
int tmp_count = t->seg_count;
ptrdiff_t extent, lb;
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t1 %d\n", t->w_rank, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t1 %d\n", t->w_rank,
t->cur_seg)); t->cur_seg));
OBJ_RELEASE(t->cur_task); OBJ_RELEASE(t->cur_task);
ptrdiff_t extent, lb;
ompi_datatype_get_extent(t->dtype, &lb, &extent); ompi_datatype_get_extent(t->dtype, &lb, &extent);
ompi_request_t *ibcast_req = NULL;
int tmp_count = t->seg_count;
if (!t->noop) { if (!t->noop) {
if (t->cur_seg <= t->num_segments - 2 ) { if (t->cur_seg <= t->num_segments - 2 ) {
if (t->cur_seg == t->num_segments - 2 && t->last_seg_count != t->seg_count) { if (t->cur_seg == t->num_segments - 2) {
tmp_count = t->last_seg_count; tmp_count = t->last_seg_count;
} }
t->up_comm->c_coll->coll_ibcast((char *) t->buff + extent * t->seg_count, t->up_comm->c_coll->coll_ibcast((char *) t->buff + extent * t->seg_count,
@ -223,12 +197,14 @@ int mca_coll_han_bcast_t1_task(void *task_argu)
} }
} }
/* are we the last segment to be pushed downstream ? */
tmp_count = (t->cur_seg == (t->num_segments - 1)) ? t->last_seg_count : t->seg_count;
t->low_comm->c_coll->coll_bcast((char *) t->buff, t->low_comm->c_coll->coll_bcast((char *) t->buff,
t->seg_count, t->dtype, t->root_low_rank, t->low_comm, tmp_count, t->dtype, t->root_low_rank, t->low_comm,
t->low_comm->c_coll->coll_bcast_module); t->low_comm->c_coll->coll_bcast_module);
if (!t->noop && ibcast_req != NULL) { if (NULL != ibcast_req) {
ompi_request_wait(&ibcast_req, MPI_STATUSES_IGNORE); ompi_request_wait(&ibcast_req, MPI_STATUS_IGNORE);
} }
return OMPI_SUCCESS; return OMPI_SUCCESS;
@ -242,51 +218,64 @@ mca_coll_han_bcast_intra_simple(void *buff,
struct ompi_communicator_t *comm, struct ompi_communicator_t *comm,
mca_coll_base_module_t *module) mca_coll_base_module_t *module)
{ {
int w_rank;
w_rank = ompi_comm_rank(comm);
/* create the subcommunicators */ /* create the subcommunicators */
mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;
mca_coll_han_comm_create_new(comm, han_module); ompi_communicator_t *low_comm, *up_comm;
ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; int err, w_rank = ompi_comm_rank(comm);
ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE];
/* Create the subcommunicators */
err = mca_coll_han_comm_create_new(comm, han_module);
if( OMPI_SUCCESS != err ) { /* Let's hope the error is consistently returned across the entire communicator */
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle bcast with this communicator. Fall back on another component\n"));
/* Put back the fallback collective support and call it once. All
* future calls will then be automatically redirected.
*/
HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
return comm->c_coll->coll_bcast(buff, count, dtype, root,
comm, comm->c_coll->coll_bcast_module);
}
/* Topo must be initialized to know rank distribution which then is used to
* determine if han can be used */
mca_coll_han_topo_init(comm, han_module, 2);
if (han_module->are_ppn_imbalanced) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle bcast with this communicator (imbalance). Fall back on another component\n"));
/* Put back the fallback collective support and call it once. All
* future calls will then be automatically redirected.
*/
HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, bcast);
return comm->c_coll->coll_bcast(buff, count, dtype, root,
comm, comm->c_coll->coll_bcast_module);
}
low_comm = han_module->sub_comm[INTRA_NODE];
up_comm = han_module->sub_comm[INTER_NODE];
int *vranks = han_module->cached_vranks; int *vranks = han_module->cached_vranks;
int low_rank = ompi_comm_rank(low_comm); int low_rank = ompi_comm_rank(low_comm);
int low_size = ompi_comm_size(low_comm); int low_size = ompi_comm_size(low_comm);
int root_low_rank; int root_low_rank, root_up_rank;
int root_up_rank;
/* Topo must be initialized to know rank distribution which then is used to
* determine if han can be used */
mca_coll_han_topo_init(comm, han_module, 2);
if (han_module->are_ppn_imbalanced){
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle bcast with this communicator. It need to fall back on another component\n"));
return han_module->previous_bcast(buff, count, dtype, root,
comm, han_module->previous_bcast_module);
} else {
OPAL_OUTPUT_VERBOSE((10, mca_coll_han_component.han_output,
"[OMPI][han] in mca_coll_han_bcast_intra_simple\n"));
}
mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank);
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"[%d]: root_low_rank %d root_up_rank %d\n", "[%d]: root_low_rank %d root_up_rank %d\n",
w_rank, root_low_rank, root_up_rank)); w_rank, root_low_rank, root_up_rank));
if (low_rank == root_low_rank) { if (low_rank == root_low_rank) {
up_comm->c_coll->coll_bcast(buff, count, dtype, root_up_rank, up_comm, up_comm->c_coll->coll_bcast_module); up_comm->c_coll->coll_bcast(buff, count, dtype, root_up_rank,
up_comm, up_comm->c_coll->coll_bcast_module);
/* To remove when han has better sub-module selection. /* To remove when han has better sub-module selection.
For now switching to ibcast enables to make runs with libnbc. */ For now switching to ibcast enables to make runs with libnbc. */
//ompi_request_t req; //ompi_request_t req;
//up_comm->c_coll->coll_ibcast(buff, count, dtype, root_up_rank, up_comm, &req, up_comm->c_coll->coll_ibcast_module); //up_comm->c_coll->coll_ibcast(buff, count, dtype, root_up_rank,
// up_comm, &req, up_comm->c_coll->coll_ibcast_module);
//ompi_request_wait(&req, MPI_STATUS_IGNORE); //ompi_request_wait(&req, MPI_STATUS_IGNORE);
} }
low_comm->c_coll->coll_bcast(buff, count, dtype, root_low_rank, low_comm, low_comm->c_coll->coll_bcast_module); low_comm->c_coll->coll_bcast(buff, count, dtype, root_low_rank,
low_comm, low_comm->c_coll->coll_bcast_module);
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }

Просмотреть файл

@ -25,13 +25,24 @@
#include "coll_han.h" #include "coll_han.h"
#include "coll_han_dynamic.h" #include "coll_han_dynamic.h"
#include "coll_han_dynamic_file.h" #include "coll_han_dynamic_file.h"
#include "ompi/mca/coll/base/coll_base_util.h"
/* /*
* Public string showing the coll ompi_han component version number * Public string showing the coll ompi_han component version number
*/ */
const char *mca_coll_han_component_version_string = const char *mca_coll_han_component_version_string =
"Open MPI han collective MCA component version " OMPI_VERSION; "Open MPI HAN collective MCA component version " OMPI_VERSION;
ompi_coll_han_components available_components[COMPONENTS_COUNT] = {
{ SELF, "self", NULL },
{ BASIC, "basic", NULL },
{ LIBNBC, "libnbc", NULL },
{ TUNED, "tuned", NULL },
{ SM, "sm", NULL },
{ SHARED, "shared", NULL },
{ ADAPT, "adapt", NULL },
{ HAN, "han", NULL }
};
/* /*
* Local functions * Local functions
@ -46,35 +57,33 @@ static int han_register(void);
*/ */
mca_coll_han_component_t mca_coll_han_component = { mca_coll_han_component_t mca_coll_han_component = {
/* First, fill in the super */ /* First, fill in the super */
{ {
/* First, the mca_component_t struct containing meta /* First, the mca_component_t struct containing meta
information about the component itself */ information about the component itself */
.collm_version = { .collm_version = {
MCA_COLL_BASE_VERSION_2_0_0, MCA_COLL_BASE_VERSION_2_0_0,
/* Component name and version */ /* Component name and version */
.mca_component_name = "han", .mca_component_name = "han",
MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
OMPI_RELEASE_VERSION), OMPI_RELEASE_VERSION),
/* Component functions */ /* Component functions */
.mca_open_component = han_open, .mca_open_component = han_open,
.mca_close_component = han_close, .mca_close_component = han_close,
.mca_register_component_params = han_register, .mca_register_component_params = han_register,
}, },
.collm_data = { .collm_data = {
/* The component is not checkpoint ready */ /* The component is not checkpoint ready */
MCA_BASE_METADATA_PARAM_NONE}, MCA_BASE_METADATA_PARAM_NONE},
/* Initialization / querying functions */ /* Initialization / querying functions */
.collm_init_query = mca_coll_han_init_query, .collm_init_query = mca_coll_han_init_query,
.collm_comm_query = mca_coll_han_comm_query, .collm_comm_query = mca_coll_han_comm_query,
}, },
/* han-component specifc information */ /* han-component specifc information */
@ -87,27 +96,9 @@ mca_coll_han_component_t mca_coll_han_component = {
*/ */
static int han_open(void) static int han_open(void)
{ {
int param; /* Get the global coll verbosity: it will be ours */
mca_coll_han_component_t *cs = &mca_coll_han_component; mca_coll_han_component.han_output = ompi_coll_base_framework.framework_output;
if (cs->han_auto_tune) {
cs->han_auto_tuned =
(selection *) malloc(2 * cs->han_auto_tune_n * cs->han_auto_tune_c *
cs->han_auto_tune_m * sizeof(selection));
char *filename = "/home/dycz0fx/results/auto/auto_tuned.bin";
FILE *file = fopen(filename, "r");
fread(cs->han_auto_tuned, sizeof(selection),
2 * cs->han_auto_tune_n * cs->han_auto_tune_c * cs->han_auto_tune_m, file);
fclose(file);
}
/*
* Get the global coll verbosity: it will be ours
*/
cs->han_output = ompi_coll_base_framework.framework_output;
opal_output_verbose(1, cs->han_output,
"coll:han:component_open: done!");
cs->topo_level = GLOBAL_COMMUNICATOR;
return mca_coll_han_init_dynamic_rules(); return mca_coll_han_init_dynamic_rules();
} }
@ -117,11 +108,6 @@ static int han_open(void)
*/ */
static int han_close(void) static int han_close(void)
{ {
mca_coll_han_component_t *cs = &mca_coll_han_component;
if (cs->han_auto_tune && cs->han_auto_tuned != NULL) {
free(cs->han_auto_tuned);
cs->han_auto_tuned = NULL;
}
mca_coll_han_free_dynamic_rules(); mca_coll_han_free_dynamic_rules();
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
@ -154,57 +140,7 @@ const char* mca_coll_han_topo_lvl_to_str(TOPO_LVL_T topo_lvl)
return "invalid topologic level"; return "invalid topologic level";
} }
} }
const char* mca_coll_han_colltype_to_str(COLLTYPE_T coll)
{
switch(coll) {
case ALLGATHER:
return "allgather";
case ALLGATHERV:
return "allgatherv";
case ALLREDUCE:
return "allreduce";
case ALLTOALL:
return "alltoall";
case ALLTOALLV:
return "alltoallv";
case ALLTOALLW:
return "alltoallw";
case BARRIER:
return "barrier";
case BCAST:
return "bcast";
case EXSCAN:
return "exscan";
case GATHER:
return "gather";
case GATHERV:
return "gatherv";
case REDUCE:
return "reduce";
case REDUCESCATTER:
return "reduce_scatter";
case REDUCESCATTERBLOCK:
return "reduce_scatter_block";
case SCAN:
return "scan";
case SCATTER:
return "scatter";
case SCATTERV:
return "scatterv";
case NEIGHBOR_ALLGATHER:
return "neighbor_allgather";
case NEIGHBOR_ALLGATHERV:
return "neighbor_allgatherv";
case NEIGHBOR_ALLTOALL:
return "neighbor_alltoall";
case NEIGHBOR_ALLTOALLV:
return "neighbor_alltoallv";
case NEIGHBOR_ALLTOALLW:
return "neighbor_alltoallw";
default:
return "";
}
}
/* /*
* Register MCA params * Register MCA params
@ -215,15 +151,14 @@ static int han_register(void)
mca_coll_han_component_t *cs = &mca_coll_han_component; mca_coll_han_component_t *cs = &mca_coll_han_component;
/* Generated parameters name and description */ /* Generated parameters name and description */
char param_name[100] = ""; char param_name[128], param_desc[256];
char param_desc[300] = "";
int param_desc_size; int param_desc_size;
COLLTYPE_T coll; COLLTYPE_T coll;
TOPO_LVL_T topo_lvl; TOPO_LVL_T topo_lvl;
COMPONENT_T component; COMPONENT_T component;
cs->han_priority = 0; cs->han_priority = 0;
(void) mca_base_component_var_register(c, "priority", "Priority of the han coll component", (void) mca_base_component_var_register(c, "priority", "Priority of the HAN coll component",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9, OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &cs->han_priority); MCA_BASE_VAR_SCOPE_READONLY, &cs->han_priority);
@ -261,16 +196,14 @@ static int han_register(void)
"up level module for allreduce, 0 libnbc, 1 adapt", "up level module for allreduce, 0 libnbc, 1 adapt",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9, OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, MCA_BASE_VAR_SCOPE_READONLY, &cs->han_reduce_up_module);
&cs->han_reduce_up_module);
cs->han_reduce_low_module = 0; cs->han_reduce_low_module = 0;
(void) mca_base_component_var_register(c, "reduce_low_module", (void) mca_base_component_var_register(c, "reduce_low_module",
"low level module for allreduce, 0 sm, 1 shared", "low level module for allreduce, 0 sm, 1 shared",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9, OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, MCA_BASE_VAR_SCOPE_READONLY, &cs->han_reduce_low_module);
&cs->han_reduce_low_module);
cs->han_allreduce_segsize = 524288; cs->han_allreduce_segsize = 524288;
(void) mca_base_component_var_register(c, "allreduce_segsize", (void) mca_base_component_var_register(c, "allreduce_segsize",
"segment size for allreduce", "segment size for allreduce",
@ -283,32 +216,28 @@ static int han_register(void)
"up level module for allreduce, 0 libnbc, 1 adapt", "up level module for allreduce, 0 libnbc, 1 adapt",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9, OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allreduce_up_module);
&cs->han_allreduce_up_module);
cs->han_allreduce_low_module = 0; cs->han_allreduce_low_module = 0;
(void) mca_base_component_var_register(c, "allreduce_low_module", (void) mca_base_component_var_register(c, "allreduce_low_module",
"low level module for allreduce, 0 sm, 1 shared", "low level module for allreduce, 0 sm, 1 shared",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9, OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allreduce_low_module);
&cs->han_allreduce_low_module);
cs->han_allgather_up_module = 0; cs->han_allgather_up_module = 0;
(void) mca_base_component_var_register(c, "allgather_up_module", (void) mca_base_component_var_register(c, "allgather_up_module",
"up level module for allgather, 0 libnbc, 1 adapt", "up level module for allgather, 0 libnbc, 1 adapt",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9, OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allgather_up_module);
&cs->han_allgather_up_module);
cs->han_allgather_low_module = 0; cs->han_allgather_low_module = 0;
(void) mca_base_component_var_register(c, "allgather_low_module", (void) mca_base_component_var_register(c, "allgather_low_module",
"low level module for allgather, 0 sm, 1 shared", "low level module for allgather, 0 sm, 1 shared",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9, OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allgather_low_module);
&cs->han_allgather_low_module);
cs->han_gather_up_module = 0; cs->han_gather_up_module = 0;
(void) mca_base_component_var_register(c, "gather_up_module", (void) mca_base_component_var_register(c, "gather_up_module",
@ -336,15 +265,7 @@ static int han_register(void)
"low level module for scatter, 0 sm, 1 shared", "low level module for scatter, 0 sm, 1 shared",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9, OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, MCA_BASE_VAR_SCOPE_READONLY, &cs->han_scatter_low_module);
&cs->han_scatter_low_module);
cs->han_auto_tune = 0;
(void) mca_base_component_var_register(c, "auto_tune",
"whether enable auto tune, 0 disable, 1 enable, default 0",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &cs->han_auto_tune);
cs->han_reproducible = 0; cs->han_reproducible = 0;
(void) mca_base_component_var_register(c, "reproducible", (void) mca_base_component_var_register(c, "reproducible",
@ -353,17 +274,15 @@ static int han_register(void)
"0 disable 1 enable, default 0", "0 disable 1 enable, default 0",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_3, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_READONLY, MCA_BASE_VAR_SCOPE_READONLY, &cs->han_reproducible);
&cs->han_reproducible);
/* Simple algorithms MCA parameters */ /* Simple algorithms MCA parameters */
for(coll = 0 ; coll < COLLCOUNT ; coll++) { for(coll = 0 ; coll < COLLCOUNT ; coll++) {
cs->use_simple_algorithm[coll] = false; cs->use_simple_algorithm[coll] = false;
if(is_simple_implemented(coll)) { if(is_simple_implemented(coll)) {
snprintf(param_name, 100, "use_simple_%s", snprintf(param_name, sizeof(param_name), "use_simple_%s",
mca_coll_han_colltype_to_str(coll)); mca_coll_base_colltype_to_str(coll));
snprintf(param_desc, 300, "whether to enable simple algo for %s", snprintf(param_desc, sizeof(param_desc), "whether to enable simple algo for %s",
mca_coll_han_colltype_to_str(coll)); mca_coll_base_colltype_to_str(coll));
mca_base_component_var_register(c, param_name, mca_base_component_var_register(c, param_name,
param_desc, param_desc,
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
@ -374,31 +293,28 @@ static int han_register(void)
} }
/* Dynamic rules MCA parameters */ /* Dynamic rules MCA parameters */
/* TODO: Find a way to avoid unused entried */
memset(cs->mca_rules, 0, memset(cs->mca_rules, 0,
COLLCOUNT * (GLOBAL_COMMUNICATOR+1) * sizeof(COMPONENT_T)); COLLCOUNT * (GLOBAL_COMMUNICATOR+1) * sizeof(COMPONENT_T));
for(coll = 0 ; coll < COLLCOUNT ; coll++) { for(coll = 0; coll < COLLCOUNT; coll++) {
if(!mca_coll_han_is_coll_dynamic_implemented(coll)) { if(!mca_coll_han_is_coll_dynamic_implemented(coll)) {
continue; continue;
} }
/* /*
* Default values * Default values
* Do not avoid to set correct default parameters
*/ */
cs->mca_rules[coll][INTRA_NODE] = TUNED; cs->mca_rules[coll][INTRA_NODE] = TUNED;
cs->mca_rules[coll][INTER_NODE] = BASIC; cs->mca_rules[coll][INTER_NODE] = BASIC;
cs->mca_rules[coll][GLOBAL_COMMUNICATOR] = HAN; cs->mca_rules[coll][GLOBAL_COMMUNICATOR] = HAN;
for(topo_lvl = 0 ; topo_lvl < NB_TOPO_LVL ; topo_lvl++) { for(topo_lvl = 0; topo_lvl < NB_TOPO_LVL; topo_lvl++) {
snprintf(param_name, 100, "%s_dynamic_%s_module", snprintf(param_name, sizeof(param_name), "%s_dynamic_%s_module",
mca_coll_han_colltype_to_str(coll), mca_coll_base_colltype_to_str(coll),
mca_coll_han_topo_lvl_to_str(topo_lvl)); mca_coll_han_topo_lvl_to_str(topo_lvl));
param_desc_size = snprintf(param_desc, 300, param_desc_size = snprintf(param_desc, sizeof(param_desc),
"Collective module to use for " "Collective module to use for %s on %s topological level: ",
"collective %s on %s topological level: ", mca_coll_base_colltype_to_str(coll),
mca_coll_han_colltype_to_str(coll),
mca_coll_han_topo_lvl_to_str(topo_lvl)); mca_coll_han_topo_lvl_to_str(topo_lvl));
/* /*
* Exhaustive description: * Exhaustive description:
@ -410,10 +326,10 @@ static int han_register(void)
/* Han can only be used on the global communicator */ /* Han can only be used on the global communicator */
continue; continue;
} }
param_desc_size += snprintf(param_desc+param_desc_size, 300, param_desc_size += snprintf(param_desc+param_desc_size, sizeof(param_desc) - param_desc_size,
"%d = %s; ", "%d = %s; ",
component, component,
components_name[component]); available_components[component].component_name);
} }
mca_base_component_var_register(c, param_name, param_desc, mca_base_component_var_register(c, param_name, param_desc,
@ -424,45 +340,11 @@ static int han_register(void)
} }
} }
/*
* TODO: remove the following lines when auto-tune is added back to the code
*/
cs->han_auto_tune = 0;
cs->han_auto_tune_n = 5;
cs->han_auto_tune_c = 3;
cs->han_auto_tune_m = 21;
#if 0
cs->han_auto_tune_n = 5;
(void) mca_base_component_var_register(c, "auto_tune_n",
"auto tune n",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &cs->han_auto_tune_n);
cs->han_auto_tune_c = 3;
(void) mca_base_component_var_register(c, "auto_tune_c",
"auto tune c",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &cs->han_auto_tune_c);
cs->han_auto_tune_m = 21;
(void) mca_base_component_var_register(c, "auto_tune_m",
"auto tune n",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&cs->han_auto_tune_m);
#endif
/* Dynamic rules */ /* Dynamic rules */
cs->use_dynamic_file_rules = false; cs->use_dynamic_file_rules = false;
(void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version, (void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version,
"use_dynamic_file_rules", "use_dynamic_file_rules",
"Switch used to decide if we use " "Enable the dynamic selection provided via the dynamic_rules_filename MCA",
"dynamic module choice rules "
"defines by file",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_6, OPAL_INFO_LVL_6,
MCA_BASE_VAR_SCOPE_READONLY, MCA_BASE_VAR_SCOPE_READONLY,
@ -471,8 +353,7 @@ static int han_register(void)
cs->dynamic_rules_filename = NULL; cs->dynamic_rules_filename = NULL;
(void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version, (void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version,
"dynamic_rules_filename", "dynamic_rules_filename",
"Filename of configuration file that " "Configuration file containing the dynamic selection rules",
"contains the dynamic module choice rules",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
OPAL_INFO_LVL_6, OPAL_INFO_LVL_6,
MCA_BASE_VAR_SCOPE_READONLY, MCA_BASE_VAR_SCOPE_READONLY,
@ -481,9 +362,7 @@ static int han_register(void)
cs->dump_dynamic_rules = false; cs->dump_dynamic_rules = false;
(void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version, (void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version,
"dump_dynamic_rules", "dump_dynamic_rules",
"Switch used to decide if we dump " "Switch used to decide if we dump dynamic rules provided by configuration file",
"dynamic rules provided by "
"configuration file",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_6, OPAL_INFO_LVL_6,
MCA_BASE_VAR_SCOPE_READONLY, MCA_BASE_VAR_SCOPE_READONLY,
@ -492,11 +371,8 @@ static int han_register(void)
if((cs->dump_dynamic_rules || NULL != cs->dynamic_rules_filename) if((cs->dump_dynamic_rules || NULL != cs->dynamic_rules_filename)
&& !cs->use_dynamic_file_rules) { && !cs->use_dynamic_file_rules) {
opal_output_verbose(0, cs->han_output, opal_output_verbose(0, cs->han_output,
"coll:han:han_register " "HAN: dynamic rules for collectives are hot activated."
"you asked for dynamic rules " "Check coll_han_use_dynamic_file_rules MCA parameter");
"but they are not activated. "
"Check coll_han_use_dynamic_file_rules "
"MCA parameter");
} }
cs->max_dynamic_errors = 10; cs->max_dynamic_errors = 10;

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,5 +1,8 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/* /*
* Copyright (c) 2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved. * Copyright (c) 2020 Bull S.A.S. All rights reserved.
* *
* $COPYRIGHT$ * $COPYRIGHT$
@ -27,9 +30,9 @@
* ################################################# * #################################################
* *
* Han dynamic rules allow the user to define the collective * Han dynamic rules allow the user to define the collective
* module to call depending the topological configuration of the * module to call depending on the topological configuration of the
* sub-communicators and the collective parameters. This mechanism * sub-communicators and the collective parameters. This mechanism
* can also be used to fallback the main collective on another module. * can also be used to fallback to the main collective on another module.
* The interface is described in coll_han_dynamic_file.h. * The interface is described in coll_han_dynamic_file.h.
* *
* ############################# * #############################
@ -39,7 +42,7 @@
* directly accesses the module on the communicator. This information is * directly accesses the module on the communicator. This information is
* stored in the collective structure of the communicator during the collective * stored in the collective structure of the communicator during the collective
* module choice at the communicator initialization. When han needs this * module choice at the communicator initialization. When han needs this
* information for the first time, it identifies the modles by their name and * information for the first time, it identifies the modules by their name and
* stores them in its module structure. * stores them in its module structure.
* Then, the modules are identified by their identifier. * Then, the modules are identified by their identifier.
* *
@ -69,7 +72,7 @@
* adds an indirection on the collective call: dynamic choice functions. These * adds an indirection on the collective call: dynamic choice functions. These
* functions do not implement any collective. First, they try to find a dynamic * functions do not implement any collective. First, they try to find a dynamic
* rule from file for the given collective. If there is not any rule for the * rule from file for the given collective. If there is not any rule for the
* fiven configuration, MCA parameter defined rules are used. Once the module * given configuration, MCA parameter defined rules are used. Once the module
* to use is found, the correct collective implementation is called. * to use is found, the correct collective implementation is called.
* *
* This indirection is also used on the global communicator. This allows han * This indirection is also used on the global communicator. This allows han
@ -92,11 +95,9 @@
* by increasing value, some of them will not be considered * by increasing value, some of them will not be considered
*/ */
BEGIN_C_DECLS
/* Dynamic rules support */ /* Dynamic rules support */
typedef enum COMPONENTS { typedef enum COMPONENTS {
SELF=0, SELF = 0,
BASIC, BASIC,
LIBNBC, LIBNBC,
TUNED, TUNED,
@ -107,18 +108,17 @@ typedef enum COMPONENTS {
COMPONENTS_COUNT COMPONENTS_COUNT
} COMPONENT_T; } COMPONENT_T;
static const char *components_name[]={"self", typedef struct {
"basic", COMPONENT_T id;
"libnbc", char* component_name;
"tuned", mca_coll_base_component_t* component;
"sm", } ompi_coll_han_components;
"shared",
"adapt", extern ompi_coll_han_components available_components[COMPONENTS_COUNT];
"han"};
/* Topologic levels */ /* Topologic levels */
typedef enum TOPO_LVL { typedef enum TOPO_LVL {
INTRA_NODE=0, INTRA_NODE = 0,
INTER_NODE, INTER_NODE,
/* Identifies the global communicator as a topologic level */ /* Identifies the global communicator as a topologic level */
GLOBAL_COMMUNICATOR, GLOBAL_COMMUNICATOR,
@ -135,7 +135,7 @@ typedef struct msg_size_rule_s {
int configuration_size; int configuration_size;
/* Message size of the rule */ /* Message size of the rule */
int msg_size; size_t msg_size;
/* Component to use on this specific configuration /* Component to use on this specific configuration
* and message size */ * and message size */
@ -209,6 +209,6 @@ typedef struct mca_coll_han_collective_modules_storage_s {
/* Tests if a dynamic collective is implemented */ /* Tests if a dynamic collective is implemented */
bool mca_coll_han_is_coll_dynamic_implemented(COLLTYPE_T coll_id); bool mca_coll_han_is_coll_dynamic_implemented(COLLTYPE_T coll_id);
COMPONENT_T mca_coll_han_component_name_to_id(const char* name);
END_C_DECLS
#endif #endif

Просмотреть файл

@ -26,11 +26,14 @@
#include "ompi/mca/coll/base/coll_base_util.h" #include "ompi/mca/coll/base/coll_base_util.h"
#define getnext_long(fptr, pval) ompi_coll_base_file_getnext_long(fptr, &fileline, pval)
#define getnext_string(fptr, pval) ompi_coll_base_file_getnext_string(fptr, &fileline, pval)
#define getnext_size_t(fptr, pval) ompi_coll_base_file_getnext_size_t(fptr, &fileline, pval)
static void check_dynamic_rules(void); static void check_dynamic_rules(void);
/* Current file line for verbose message */ /* Current file line for verbose message */
static int fileline = 1; static int fileline = 1;
#define getnext(fptr) ompi_coll_base_file_getnext(fptr, &fileline)
int int
mca_coll_han_init_dynamic_rules(void) mca_coll_han_init_dynamic_rules(void)
@ -38,31 +41,31 @@ mca_coll_han_init_dynamic_rules(void)
/* File management */ /* File management */
const char *fname; const char *fname;
FILE *fptr = NULL; FILE *fptr = NULL;
int nb_entries = 0; int nb_entries = 0, rc;
/* Loop counters */ /* Loop counters */
int i, j, k, l; int i, j, k, l;
/* Collective informations */ /* Collective informations */
int nb_coll; long nb_coll, coll_id;
COLLTYPE_T coll_id; char * coll_name = NULL;
collective_rule_t *coll_rules; collective_rule_t *coll_rules;
/* Topo informations */ /* Topo informations */
int nb_topo; long nb_topo, topo_lvl;
TOPO_LVL_T topo_lvl;
topologic_rule_t *topo_rules; topologic_rule_t *topo_rules;
/* Configuration informations */ /* Configuration informations */
int nb_rules, conf_size; long nb_rules, conf_size;
configuration_rule_t *conf_rules; configuration_rule_t *conf_rules;
/* Message size informations */ /* Message size informations */
int nb_msg_size, msg_size; long nb_msg_size;
size_t msg_size;
msg_size_rule_t *msg_size_rules; msg_size_rule_t *msg_size_rules;
/* Component informations */ /* Component informations */
COMPONENT_T component; long component;
/* If the dynamic rules are not used, do not even read the file */ /* If the dynamic rules are not used, do not even read the file */
if(!mca_coll_han_component.use_dynamic_file_rules) { if(!mca_coll_han_component.use_dynamic_file_rules) {
@ -70,47 +73,31 @@ mca_coll_han_init_dynamic_rules(void)
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
fname = mca_coll_han_component.dynamic_rules_filename; if( NULL == (fname = mca_coll_han_component.dynamic_rules_filename) ) {
if(NULL == fname) {
opal_output_verbose(5, mca_coll_han_component.han_output, opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules " "coll:han:mca_coll_han_init_dynamic_rules coll_han_use_dynamic_file_rules is set but "
"coll_han_use_dynamic_file_rules is true but " "coll_han_dynamic_rules_filename is not Rules from MCA parameters will be used instead\n");
"coll_han_dynamic_rules_filename is not set: "
"coll han will use dynamic rules from mca "
"parameters and their default value\n");
mca_coll_han_component.dynamic_rules.nb_collectives = 0; mca_coll_han_component.dynamic_rules.nb_collectives = 0;
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
fptr = fopen(fname, "r"); if( NULL == (fptr = fopen(fname, "r")) ) {
if(NULL == fptr) {
opal_output_verbose(5, mca_coll_han_component.han_output, opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules " "coll:han:mca_coll_han_init_dynamic_rules cannot open dynamic file provided by "
"cannot open dynamic file provided by " "coll_han_dynamic_rules_filename=%s. Make sure it provides the full path and "
"coll_han_dynamic_rules_filename=%s " "check file permissions. Rules from MCA parameters will be used instead\n",
"please provide it with full path and "
"check file permissions. Rules from "
"MCA parameters will be used instead\n",
fname); fname);
mca_coll_han_component.dynamic_rules.nb_collectives = 0; mca_coll_han_component.dynamic_rules.nb_collectives = 0;
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
/* The first information of the file is the collective count */ /* The first information of the file is the collective count */
nb_coll = getnext(fptr); if( (getnext_long(fptr, &nb_coll) < 0) || (nb_coll <= 0) ) {
if(nb_coll <= 0) {
opal_output_verbose(5, mca_coll_han_component.han_output, opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules " "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s "
"found an error on dynamic rules file %s " "at line %d: an invalid value %ld is given for collective count "
"at line %d: an invalid value %d is given "
"for collective count "
"or the reader encountered an unexpected EOF\n", "or the reader encountered an unexpected EOF\n",
fname, fname, fileline, nb_coll);
fileline,
nb_coll);
mca_coll_han_component.dynamic_rules.nb_collectives = 0; mca_coll_han_component.dynamic_rules.nb_collectives = 0;
goto file_reading_error; goto file_reading_error;
} }
@ -126,69 +113,65 @@ mca_coll_han_init_dynamic_rules(void)
} }
/* Iterates on collective rules */ /* Iterates on collective rules */
for(i=0 ; i<nb_coll ; i++) { for( i = 0 ; i < nb_coll ; i++ ) {
coll_rules[i].nb_topologic_levels = 0;
mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
/* Get the collective identifier */ /* Get the collective identifier */
coll_id = getnext(fptr); if( getnext_string(fptr, &coll_name) < 0 ) {
if(coll_id < ALLGATHER || coll_id >= COLLCOUNT) {
opal_output_verbose(5, mca_coll_han_component.han_output, opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules " "coll:han:mca_coll_han_init_dynamic_rules invalid collective at line %d."
"invalid collective id %d at line %d: the collective " "The rest of the input file will be ignored.\n",
"must be at least %d and less than %d\n", fileline);
coll_id,
fileline,
ALLGATHER,
COLLCOUNT);
coll_rules[i].nb_topologic_levels = 0;
mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
goto file_reading_error; goto file_reading_error;
} }
coll_id = mca_coll_base_name_to_colltype(coll_name);
if( (coll_id < ALLGATHER) || (coll_id >= COLLCOUNT)) {
/* maybe the file was in the old format and we read the collective index instead of the name. */
char* endp;
coll_id = strtol(coll_name, &endp, 10);
if( '\0' != *endp ) { /* there is garbage in the input */
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules invalid collective %s "
"at line %d: the collective must be at least %d and less than %d. "
"The rest of the input file will be ignored.\n",
coll_name, fileline, ALLGATHER, COLLCOUNT);
goto file_reading_error;
}
free(coll_name);
coll_name = mca_coll_base_colltype_to_str(coll_id);
}
if(!mca_coll_han_is_coll_dynamic_implemented(coll_id)) { if(!mca_coll_han_is_coll_dynamic_implemented(coll_id)) {
opal_output_verbose(5, mca_coll_han_component.han_output, opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules " "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s "
"found an error on dynamic rules file %s " "read collective id %ld at line %d but this collective is not implemented yet. "
"read collective id %d at line %d " "This is not an error but this set of rules will not be used\n",
"but this collective is not implemented yet. " fname, coll_id, fileline);
"This is not an error but this set of rules "
"will not be used\n",
fname,
coll_id,
fileline);
} }
/* /*
* The first information of a collective rule * The first information of a collective rule
* is the number of topologic rules * is the number of topologic rules
*/ */
nb_topo = getnext(fptr); if( (getnext_long(fptr, &nb_topo) < 0) || (nb_topo < 0) ) {
if(nb_topo < 0) {
opal_output_verbose(5, mca_coll_han_component.han_output, opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules " "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s "
"found an error on dynamic rules file %s " "at line %d: an invalid value %ld is given for topo level count "
"at line %d: an invalid value %d is given "
"for topo level count "
"or the reader encountered an unexpected EOF\n", "or the reader encountered an unexpected EOF\n",
fname, fname, fileline, nb_topo);
fileline,
nb_topo);
coll_rules[i].nb_topologic_levels = 0;
mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
goto file_reading_error; goto file_reading_error;
} }
/* Store the collective rule informations */ /* Store the collective rule informations */
coll_rules[i].collective_id = coll_id;
coll_rules[i].nb_topologic_levels = nb_topo; coll_rules[i].nb_topologic_levels = nb_topo;
coll_rules[i].collective_id = (COLLTYPE_T)coll_id;
if(0 == nb_topo) { if(0 == nb_topo) {
opal_output_verbose(5, mca_coll_han_component.han_output, opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules " "coll:han:mca_coll_han_init_dynamic_rules Warning on dynamic rules file %s "
"Warning on dynamic rules file %s " "at line %d: an invalid value %ld is given for topo level count\n",
"at line %d: an invalid value %d is given " fname, fileline, nb_topo);
"for topo level count\n",
fname,
fileline,
nb_topo);
continue; continue;
} }
@ -197,30 +180,21 @@ mca_coll_han_init_dynamic_rules(void)
coll_rules[i].topologic_rules = topo_rules; coll_rules[i].topologic_rules = topo_rules;
if(NULL == topo_rules) { if(NULL == topo_rules) {
coll_rules[i].nb_topologic_levels = 0; coll_rules[i].nb_topologic_levels = 0;
mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
goto cannot_allocate; goto cannot_allocate;
} }
/* Iterates on topologic rules */ /* Iterates on topologic rules */
for(j=0 ; j<nb_topo ; j++) { for( j = 0 ; j < nb_topo ; j++ ) {
topo_rules[j].nb_rules = 0;
coll_rules[i].nb_topologic_levels = j+1;
/* Get the topologic level identifier */ /* Get the topologic level identifier */
topo_lvl = getnext(fptr); if( (getnext_long(fptr, &topo_lvl) < 0) || (topo_lvl < INTRA_NODE) || (topo_lvl >= NB_TOPO_LVL) ) {
if(topo_lvl < INTRA_NODE || topo_lvl >= NB_TOPO_LVL) {
opal_output_verbose(5, mca_coll_han_component.han_output, opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules " "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s "
"found an error on dynamic rules file %s " "at line %d: an invalid topo level %ld is given or the reader encountered an unexpected EOF. "
"at line %d: an invalid topo level %d is given " "Topologic level must be at least %d and less than %d\n",
"or the reader encountered an unexpected EOF. " fname, fileline, topo_lvl, INTRA_NODE, NB_TOPO_LVL);
"Topologic level must be at least %d and "
"less than %d\n",
fname,
fileline,
topo_lvl,
INTRA_NODE,
NB_TOPO_LVL);
topo_rules[j].nb_rules = 0;
coll_rules[i].nb_topologic_levels = j+1;
mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
goto file_reading_error; goto file_reading_error;
} }
@ -228,38 +202,26 @@ mca_coll_han_init_dynamic_rules(void)
* The first information of a topologic rule * The first information of a topologic rule
* is the number of configurations * is the number of configurations
*/ */
nb_rules = getnext(fptr); nb_rules = -1;
if( (getnext_long(fptr, &nb_rules) < 0) || (nb_rules < 0) ) {
if(nb_rules < 0) {
opal_output_verbose(5, mca_coll_han_component.han_output, opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules " "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s "
"found an error on dynamic rules file %s " "at line %d: an invalid value %ld is given for rules count "
"at line %d: an invalid value %d "
"is given for rules count "
"or the reader encountered an unexpected EOF\n", "or the reader encountered an unexpected EOF\n",
fname, fname, fileline, nb_rules);
fileline,
nb_rules);
topo_rules[j].nb_rules = 0;
coll_rules[i].nb_topologic_levels = j+1;
mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
goto file_reading_error; goto file_reading_error;
} }
/* Store the topologic rule informations */ /* Store the topologic rule informations */
topo_rules[j].collective_id = coll_id; topo_rules[j].collective_id = coll_id;
topo_rules[j].topologic_level = topo_lvl; topo_rules[j].topologic_level = (TOPO_LVL_T)topo_lvl;
topo_rules[j].nb_rules = nb_rules; topo_rules[j].nb_rules = nb_rules;
if(0 == nb_rules) { if(0 == nb_rules) {
opal_output_verbose(5, mca_coll_han_component.han_output, opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules " "coll:han:mca_coll_han_init_dynamic_rules Warning on dynamic rules file %s "
"Warning on dynamic rules file %s " "at line %d: an invalid value %ld is given for configuration rules count\n",
"at line %d: an invalid value %d is given " fname, fileline, nb_rules);
"for configuration rules count\n",
fname,
fileline,
nb_rules);
continue; continue;
} }
@ -268,32 +230,21 @@ mca_coll_han_init_dynamic_rules(void)
topo_rules[j].configuration_rules = conf_rules; topo_rules[j].configuration_rules = conf_rules;
if(NULL == conf_rules) { if(NULL == conf_rules) {
topo_rules[j].nb_rules = 0; topo_rules[j].nb_rules = 0;
coll_rules[i].nb_topologic_levels = j+1;
mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
goto cannot_allocate; goto cannot_allocate;
} }
/* Iterate on configuration rules */ /* Iterate on configuration rules */
for(k=0 ; k<nb_rules ; k++) { for( k = 0; k < nb_rules; k++ ) {
/* Get the configuration size */ conf_rules[k].nb_msg_size = 0;
conf_size = getnext(fptr); topo_rules[j].nb_rules = k+1;
if(conf_size < 1 || (0 == k && conf_size > 1)) { /* Get the configuration size */
if( (getnext_long(fptr, &conf_size) < 0) || (conf_size < 1) || (0 == k && conf_size > 1) ) {
opal_output_verbose(5, mca_coll_han_component.han_output, opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules " "coll:han:mca_coll_han_init_dynamic_rules invalid configuration size %ld at line %d "
"invalid configuration size %d at line %d " "or the reader encountered an unexpected EOF the configuration size must be at least %d "
"or the reader encountered an unexpected EOF " "and the first configuration size of a topologic level must be %d\n",
"the configuration size must be at least %d " conf_size, fileline, 1, 1);
"and the first configuration size "
"of a topologic level must be %d\n",
conf_size,
fileline,
1,
1);
conf_rules[k].nb_msg_size = 0;
topo_rules[j].nb_rules = k+1;
coll_rules[i].nb_topologic_levels = j+1;
mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
goto file_reading_error; goto file_reading_error;
} }
@ -301,21 +252,12 @@ mca_coll_han_init_dynamic_rules(void)
* The first information of a configuration rule * The first information of a configuration rule
* is the number of message size rules * is the number of message size rules
*/ */
nb_msg_size = getnext(fptr); if( (getnext_long(fptr, &nb_msg_size) < 0) || (nb_msg_size < 0) ) {
if(nb_msg_size < 0) {
opal_output_verbose(5, mca_coll_han_component.han_output, opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules " "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s "
"found an error on dynamic rules file %s " "at line %d: an invalid value %ld is given for message size rules count "
"at line %d: an invalid value %d "
"is given for message size rules count "
"or the reader encountered an unexpected EOF\n", "or the reader encountered an unexpected EOF\n",
fname, fname, fileline, nb_msg_size);
fileline,
nb_msg_size);
conf_rules[k].nb_msg_size = 0;
topo_rules[j].nb_rules = k+1;
coll_rules[i].nb_topologic_levels = j+1;
mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
goto file_reading_error; goto file_reading_error;
} }
@ -327,13 +269,9 @@ mca_coll_han_init_dynamic_rules(void)
if(0 == nb_msg_size) { if(0 == nb_msg_size) {
opal_output_verbose(5, mca_coll_han_component.han_output, opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules " "coll:han:mca_coll_han_init_dynamic_rules Warning on dynamic rules file %s "
"Warning on dynamic rules file %s " "at line %d: an invalid value %ld is given for message size rules count\n",
"at line %d: an invalid value %d is given " fname, fileline, nb_msg_size);
"for message size rules count\n",
fname,
fileline,
nb_msg_size);
continue; continue;
} }
@ -342,88 +280,99 @@ mca_coll_han_init_dynamic_rules(void)
conf_rules[k].msg_size_rules = msg_size_rules; conf_rules[k].msg_size_rules = msg_size_rules;
if(NULL == msg_size_rules) { if(NULL == msg_size_rules) {
conf_rules[k].nb_msg_size = 0; conf_rules[k].nb_msg_size = 0;
topo_rules[j].nb_rules = k+1;
coll_rules[i].nb_topologic_levels = j+1;
mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
goto cannot_allocate; goto cannot_allocate;
} }
/* Iterate on message size rules */ /* Iterate on message size rules */
for(l=0 ; l<nb_msg_size ; l++) { for( l = 0; l < nb_msg_size; l++ ) {
char* target_comp_name = NULL;
conf_rules[k].nb_msg_size = l+1;
/* Get the message size */ /* Get the message size */
msg_size = getnext(fptr); rc = getnext_size_t(fptr, &msg_size);
if(msg_size < 0 if( (rc < 0) ||
|| (0 ==l && msg_size > 1)) { (0 == l && msg_size > 1)) {
opal_output_verbose(5, mca_coll_han_component.han_output, opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules " "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s "
"found an error on dynamic rules file %s " "at line %d: an invalid value %" PRIsize_t " is given for message size "
"at line %d: an invalid value %d " "or the reader encountered an unexpected EOF. "
"is given for message size " "The first message size rule of a configuration must be 0\n",
"or the reader encountered " fname, fileline, msg_size);
"an unexpected EOF. "
"The first message size rule of "
"a configuration must be 0\n",
fname,
fileline,
msg_size);
conf_rules[k].nb_msg_size = l+1;
topo_rules[j].nb_rules = k+1;
coll_rules[i].nb_topologic_levels = j+1;
mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
goto file_reading_error; goto file_reading_error;
} }
/* Get the component identifier for this message size rule */ /* Get the component identifier for this message size rule */
component = getnext(fptr); if( getnext_string(fptr, &target_comp_name) < 0 ) {
if(component < SELF || component >= COMPONENTS_COUNT) {
opal_output_verbose(5, mca_coll_han_component.han_output, opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules " "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s "
"found an error on dynamic rules file %s " "at line %d: cannot read the name of a collective component\n",
"at line %d: an invalid collective " fname, fileline);
"component id %d is given or the " goto file_reading_error;
"reader encountered an unexpected EOF. " }
"Collective component id must be at " component = mca_coll_han_component_name_to_id(target_comp_name);
if( (component < SELF) || (component >= COMPONENTS_COUNT) ) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s "
"at line %d: an invalid collective component name %s was given or the "
"reader encountered an unexpected EOF. Collective component id must be at "
"least %d and less than %d\n", "least %d and less than %d\n",
fname, fname, fileline, target_comp_name, SELF, COMPONENTS_COUNT);
fileline, free(target_comp_name);
component,
SELF,
COMPONENTS_COUNT);
conf_rules[k].nb_msg_size = l+1;
topo_rules[j].nb_rules = k+1;
coll_rules[i].nb_topologic_levels = j+1;
mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
goto file_reading_error; goto file_reading_error;
} }
/* Store message size rule informations */ /* Store message size rule information */
msg_size_rules[l].collective_id = coll_id; msg_size_rules[l].collective_id = coll_id;
msg_size_rules[l].topologic_level = topo_lvl; msg_size_rules[l].topologic_level = topo_lvl;
msg_size_rules[l].configuration_size = conf_size; msg_size_rules[l].configuration_size = conf_size;
msg_size_rules[l].msg_size = msg_size; msg_size_rules[l].msg_size = msg_size;
msg_size_rules[l].component = component; msg_size_rules[l].component = (COMPONENT_T)component;
nb_entries++; nb_entries++;
/* do we have the optional segment length */
if( 1 == ompi_coll_base_file_peek_next_char_is(fptr, &fileline, '[') ) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules found optional pipelining segment lengths\n");
long seglength;
if( 0 != topo_lvl ) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules "
"file %s line %d found segment lengths for topological collective at level != 0 "
"for collective %s component %s. These values will be ignored.\n",
fname, fileline, coll_name, target_comp_name);
}
while( 0 == ompi_coll_base_file_peek_next_char_is(fptr, &fileline, ']') ) {
if( getnext_long(fptr, &seglength) ) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules "
"file %s line %d found end of file while reading the optional list "
"of segment lengths for collective %s component %s\n",
fname, fileline, coll_name, target_comp_name);
free(target_comp_name);
goto file_reading_error;
}
}
}
free(target_comp_name);
} }
} }
} }
if( NULL != coll_name ) {
free(coll_name);
coll_name = NULL;
}
} }
if(MYEOF != getnext(fptr)) { if( getnext_long(fptr, &nb_coll) > 0 ) {
opal_output_verbose(5, mca_coll_han_component.han_output, opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules " "coll:han:mca_coll_han_init_dynamic_rules. Warning on file %s at line %d: "
"Warning on file %s at line %d: " "rule reading is over but reader does not seem to have reached the end of the file\n",
"rule reading is over but reader does not seem " fname, fileline);
"to have reached the end of the file\n",
fname,
fileline);
} }
opal_output_verbose(5, mca_coll_han_component.han_output, opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules " "coll:han:mca_coll_han_init_dynamic_rules read %d rules from %s\n",
"read %d rules from %s\n", nb_entries, fname);
nb_entries,
fname);
if(mca_coll_han_component.dump_dynamic_rules) { if(mca_coll_han_component.dump_dynamic_rules) {
mca_coll_han_dump_dynamic_rules(); mca_coll_han_dump_dynamic_rules();
@ -447,6 +396,9 @@ cannot_allocate:
return OMPI_ERROR; return OMPI_ERROR;
file_reading_error: file_reading_error:
if( NULL != coll_name ) {
free(coll_name);
}
opal_output_verbose(0, mca_coll_han_component.han_output, opal_output_verbose(0, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules " "coll:han:mca_coll_han_init_dynamic_rules "
"could not fully read dynamic rules file. " "could not fully read dynamic rules file. "
@ -531,7 +483,8 @@ static void check_dynamic_rules(void)
configuration_rule_t *conf_rules; configuration_rule_t *conf_rules;
/* Message size informations */ /* Message size informations */
int nb_msg_size, msg_size; int nb_msg_size;
size_t msg_size;
msg_size_rule_t *msg_size_rules; msg_size_rule_t *msg_size_rules;
/* Component informations */ /* Component informations */
@ -540,73 +493,49 @@ static void check_dynamic_rules(void)
nb_coll = mca_coll_han_component.dynamic_rules.nb_collectives; nb_coll = mca_coll_han_component.dynamic_rules.nb_collectives;
coll_rules = mca_coll_han_component.dynamic_rules.collective_rules; coll_rules = mca_coll_han_component.dynamic_rules.collective_rules;
for(i=0 ; i<nb_coll ; i++) { for( i = 0; i < nb_coll; i++ ) {
coll_id = coll_rules[i].collective_id; coll_id = coll_rules[i].collective_id;
nb_topo = coll_rules[i].nb_topologic_levels; nb_topo = coll_rules[i].nb_topologic_levels;
topo_rules = coll_rules[i].topologic_rules; topo_rules = coll_rules[i].topologic_rules;
for(j=0 ; j<nb_topo ; j++) { for( j = 0; j < nb_topo; j++ ) {
topo_lvl = topo_rules[j].topologic_level; topo_lvl = topo_rules[j].topologic_level;
nb_rules = topo_rules[j].nb_rules; nb_rules = topo_rules[j].nb_rules;
conf_rules = topo_rules[j].configuration_rules; conf_rules = topo_rules[j].configuration_rules;
for(k=0 ; k<nb_rules ; k++) { for( k = 0; k < nb_rules; k++ ) {
conf_size = conf_rules[k].configuration_size; conf_size = conf_rules[k].configuration_size;
nb_msg_size = conf_rules[k].nb_msg_size; nb_msg_size = conf_rules[k].nb_msg_size;
msg_size_rules = conf_rules[k].msg_size_rules; msg_size_rules = conf_rules[k].msg_size_rules;
if(k>=1 && conf_rules[k-1].configuration_size > conf_size) { if( k >= 1 && conf_rules[k-1].configuration_size > conf_size) {
opal_output_verbose(5, mca_coll_han_component.han_output, opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:check_dynamic_rules " "coll:han:check_dynamic_rules HAN found an issue on dynamic rules "
"Han found an issue on dynamic rules " "for collective %d on topological level %d: "
"for collective %d " "configuration sizes %d and %d are not sorted by increasing value\n",
"on topological level %d: " coll_id, topo_lvl, conf_rules[k-1].configuration_size, conf_size);
"configuration sizes %d and %d are "
"not sorted by increasing value\n",
coll_id,
topo_lvl,
conf_rules[k-1].configuration_size,
conf_size);
} }
for(l=0 ; l<nb_msg_size ; l++) { for( l = 0; l < nb_msg_size; l++ ) {
msg_size = msg_size_rules[l].msg_size; msg_size = msg_size_rules[l].msg_size;
component = msg_size_rules[l].component; component = msg_size_rules[l].component;
if(l>=1 && msg_size_rules[l-1].msg_size > msg_size) { if( l >= 1 && msg_size_rules[l-1].msg_size > msg_size) {
opal_output_verbose(5, mca_coll_han_component.han_output, opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:check_dynamic_rules " "coll:han:check_dynamic_rules HAN found an issue on dynamic rules "
"Han found an issue on dynamic rules " "for collective %d on topological level %d with configuration size %d: "
"for collective %d " "message sizes %" PRIsize_t " and %" PRIsize_t " are "
"on topological level %d "
"with configuration size %d: "
"message sizes %d and %d are "
"not sorted by increasing value\n", "not sorted by increasing value\n",
coll_id, coll_id, topo_lvl, conf_size, msg_size_rules[l-1].msg_size, msg_size);
topo_lvl,
conf_size,
msg_size_rules[l-1].msg_size,
msg_size);
} }
if(HAN == component if( (HAN == component) && (GLOBAL_COMMUNICATOR != topo_lvl) ) {
&& GLOBAL_COMMUNICATOR != topo_lvl) {
opal_output_verbose(5, mca_coll_han_component.han_output, opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:check_dynamic_rules " "coll:han:check_dynamic_rules HAN found an issue on dynamic rules "
"Han found an issue on dynamic rules " "for collective %d on topological level %d with configuration size %d "
"for collective %d " "for message size %" PRIsize_t ": han collective component %d "
"on topological level %d " "can only be activated for topology level %d\n",
"with configuration size %d " coll_id, topo_lvl, conf_size, msg_size, HAN, GLOBAL_COMMUNICATOR);
"for message size %d: "
"han collective component %d "
"can only be activated for "
"topology level %d\n",
coll_id,
topo_lvl,
conf_size,
msg_size,
HAN,
GLOBAL_COMMUNICATOR);
} }
} }
} }
@ -618,9 +547,6 @@ void mca_coll_han_dump_dynamic_rules(void)
{ {
int nb_entries = 0; int nb_entries = 0;
/* Loop counters */
int i, j, k, l;
/* Collective informations */ /* Collective informations */
int nb_coll; int nb_coll;
COLLTYPE_T coll_id; COLLTYPE_T coll_id;
@ -645,42 +571,32 @@ void mca_coll_han_dump_dynamic_rules(void)
nb_coll = mca_coll_han_component.dynamic_rules.nb_collectives; nb_coll = mca_coll_han_component.dynamic_rules.nb_collectives;
coll_rules = mca_coll_han_component.dynamic_rules.collective_rules; coll_rules = mca_coll_han_component.dynamic_rules.collective_rules;
for(i=0 ; i<nb_coll ; i++) { for(int i = 0; i < nb_coll; i++ ) {
coll_id = coll_rules[i].collective_id; coll_id = coll_rules[i].collective_id;
nb_topo = coll_rules[i].nb_topologic_levels; nb_topo = coll_rules[i].nb_topologic_levels;
topo_rules = coll_rules[i].topologic_rules; topo_rules = coll_rules[i].topologic_rules;
for(j=0 ; j<nb_topo ; j++) { for(int j = 0; j < nb_topo; j++ ) {
topo_lvl = topo_rules[j].topologic_level; topo_lvl = topo_rules[j].topologic_level;
nb_rules = topo_rules[j].nb_rules; nb_rules = topo_rules[j].nb_rules;
conf_rules = topo_rules[j].configuration_rules; conf_rules = topo_rules[j].configuration_rules;
for(k=0 ; k<nb_rules ; k++) { for(int k = 0; k < nb_rules; k++ ) {
conf_size = conf_rules[k].configuration_size; conf_size = conf_rules[k].configuration_size;
nb_msg_size = conf_rules[k].nb_msg_size; nb_msg_size = conf_rules[k].nb_msg_size;
msg_size_rules = conf_rules[k].msg_size_rules; msg_size_rules = conf_rules[k].msg_size_rules;
for(l=0 ; l<nb_msg_size ; l++) { for(int l = 0; l < nb_msg_size; l++ ) {
msg_size = msg_size_rules[l].msg_size; msg_size = msg_size_rules[l].msg_size;
component = msg_size_rules[l].component; component = msg_size_rules[l].component;
opal_output(mca_coll_han_component.han_output, opal_output(mca_coll_han_component.han_output,
"coll:han:dump_dynamic_rules " "coll:han:dump_dynamic_rules %d collective %d (%s) "
"Entry %d " "topology level %d (%s) configuration size %d "
"collective %d (%s) " "mesage size %d -> collective component %d (%s)\n",
"topology level %d (%s) " nb_entries, coll_id, mca_coll_base_colltype_to_str(coll_id),
"configuration size %d " topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), conf_size,
"mesage size %d " msg_size, component, available_components[component].component_name);
"-> collective component %d (%s)\n",
nb_entries,
coll_id,
mca_coll_han_colltype_to_str(coll_id),
topo_lvl,
mca_coll_han_topo_lvl_to_str(topo_lvl),
conf_size,
msg_size,
component,
components_name[component]);
nb_entries++; nb_entries++;
} }

Просмотреть файл

@ -1,5 +1,8 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/* /*
* Copyright (c) 2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved. * Copyright (c) 2020 Bull S.A.S. All rights reserved.
* *
* $COPYRIGHT$ * $COPYRIGHT$
@ -60,10 +63,9 @@
* communicator and the corresponding level for sub-communicators * communicator and the corresponding level for sub-communicators
* created by han. * created by han.
* - Configuration size: * - Configuration size:
* The configuration size is the number of elements of the actual * The configuration size is the number of elements in a topology level.
* topology level in the upper topology level. For example, if * For example, if topology levels are intra-node and inter-node, it can
* topology levels are intra-node and inter-node, it can be the * be the number of MPI ranks per node or the number of nodes in the global
* number of MPI ranks per node or the number of nodes in the global
* communicator. For the GLOBAL_COMMUNICATOR topologic level, * communicator. For the GLOBAL_COMMUNICATOR topologic level,
* the configuration size is the communicator size. * the configuration size is the communicator size.
* - Message_size Component: * - Message_size Component:
@ -101,11 +103,8 @@
* the reader. * the reader.
*/ */
BEGIN_C_DECLS
int mca_coll_han_init_dynamic_rules(void); int mca_coll_han_init_dynamic_rules(void);
void mca_coll_han_free_dynamic_rules(void); void mca_coll_han_free_dynamic_rules(void);
void mca_coll_han_dump_dynamic_rules(void); void mca_coll_han_dump_dynamic_rules(void);
END_C_DECLS
#endif #endif

Просмотреть файл

@ -16,40 +16,45 @@
#include "ompi/mca/pml/pml.h" #include "ompi/mca/pml/pml.h"
#include "coll_han_trigger.h" #include "coll_han_trigger.h"
static int mca_coll_han_gather_lg_task(void *task_args);
static int mca_coll_han_gather_ug_task(void *task_args);
/* only work with regular situation (each node has equal number of processes) */ /* only work with regular situation (each node has equal number of processes) */
void mac_coll_han_set_gather_argu(mca_gather_argu_t * argu, static inline void
mca_coll_task_t * cur_task, mca_coll_han_set_gather_args(mca_coll_han_gather_args_t * args,
void *sbuf, mca_coll_task_t * cur_task,
void *sbuf_inter_free, void *sbuf,
int scount, void *sbuf_inter_free,
struct ompi_datatype_t *sdtype, int scount,
void *rbuf, struct ompi_datatype_t *sdtype,
int rcount, void *rbuf,
struct ompi_datatype_t *rdtype, int rcount,
int root, struct ompi_datatype_t *rdtype,
int root_up_rank, int root,
int root_low_rank, int root_up_rank,
struct ompi_communicator_t *up_comm, int root_low_rank,
struct ompi_communicator_t *low_comm, struct ompi_communicator_t *up_comm,
int w_rank, bool noop, ompi_request_t * req) struct ompi_communicator_t *low_comm,
int w_rank, bool noop, bool is_mapbycore, ompi_request_t * req)
{ {
argu->cur_task = cur_task; args->cur_task = cur_task;
argu->sbuf = sbuf; args->sbuf = sbuf;
argu->sbuf_inter_free = sbuf_inter_free; args->sbuf_inter_free = sbuf_inter_free;
argu->scount = scount; args->scount = scount;
argu->sdtype = sdtype; args->sdtype = sdtype;
argu->rbuf = rbuf; args->rbuf = rbuf;
argu->rcount = rcount; args->rcount = rcount;
argu->rdtype = rdtype; args->rdtype = rdtype;
argu->root = root; args->root = root;
argu->root_up_rank = root_up_rank; args->root_up_rank = root_up_rank;
argu->root_low_rank = root_low_rank; args->root_low_rank = root_low_rank;
argu->up_comm = up_comm; args->up_comm = up_comm;
argu->low_comm = low_comm; args->low_comm = low_comm;
argu->w_rank = w_rank; args->w_rank = w_rank;
argu->noop = noop; args->noop = noop;
argu->req = req; args->is_mapbycore = is_mapbycore;
args->req = req;
} }
int int
@ -61,50 +66,56 @@ mca_coll_han_gather_intra(const void *sbuf, int scount,
struct ompi_communicator_t *comm, struct ompi_communicator_t *comm,
mca_coll_base_module_t * module) mca_coll_base_module_t * module)
{ {
int i; mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module;
int w_rank, w_size; /* information about the global communicator */ int w_rank, w_size; /* information about the global communicator */
int root_low_rank, root_up_rank; /* root ranks for both sub-communicators */ int root_low_rank, root_up_rank; /* root ranks for both sub-communicators */
char *reorder_buf = NULL, *reorder_rbuf = NULL; char *reorder_buf = NULL, *reorder_rbuf = NULL;
ptrdiff_t rsize, rgap = 0, rextent; int i, err, *vranks, low_rank, low_size, *topo;
int *vranks, low_rank, low_size;
int * topo;
ompi_request_t *temp_request = NULL; ompi_request_t *temp_request = NULL;
w_rank = ompi_comm_rank(comm);
w_size = ompi_comm_size(comm);
/* Create the subcommunicators */ /* Create the subcommunicators */
mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; err = mca_coll_han_comm_create(comm, han_module);
if( OMPI_SUCCESS != err ) { /* Let's hope the error is consistently returned across the entire communicator */
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle gather with this communicator. Fall back on another component\n"));
/* HAN cannot work with this communicator so fallback on all collectives */
HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf,
rcount, rdtype, root,
comm, comm->c_coll->coll_gather_module);
}
/* Topo must be initialized to know rank distribution which then is used to /* Topo must be initialized to know rank distribution which then is used to
* determine if han can be used */ * determine if han can be used */
topo = mca_coll_han_topo_init(comm, han_module, 2); topo = mca_coll_han_topo_init(comm, han_module, 2);
if (han_module->are_ppn_imbalanced) {
if (han_module->are_ppn_imbalanced){
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle gather with this communicator. It need to fall back on another component\n")); "han cannot handle gather with this communicator (imbalance). Fall back on another component\n"));
return han_module->previous_gather(sbuf, scount, sdtype, rbuf, /* Put back the fallback collective support and call it once. All
rcount, rdtype, root, * future calls will then be automatically redirected.
comm, han_module->previous_gather_module); */
HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, gather);
return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf,
rcount, rdtype, root,
comm, comm->c_coll->coll_gather_module);
} }
w_rank = ompi_comm_rank(comm);
w_size = ompi_comm_size(comm);
/* Set up request */ /* Set up request */
temp_request = OBJ_NEW(ompi_request_t); temp_request = OBJ_NEW(ompi_request_t);
OMPI_REQUEST_INIT(temp_request, false);
temp_request->req_state = OMPI_REQUEST_ACTIVE; temp_request->req_state = OMPI_REQUEST_ACTIVE;
temp_request->req_type = 0; temp_request->req_type = OMPI_REQUEST_COLL;
temp_request->req_free = han_request_free; temp_request->req_free = han_request_free;
temp_request->req_status.MPI_SOURCE = 0; temp_request->req_status = (ompi_status_public_t){0};
temp_request->req_status.MPI_TAG = 0; temp_request->req_complete = REQUEST_PENDING;
temp_request->req_status.MPI_ERROR = 0;
temp_request->req_status._cancelled = 0;
temp_request->req_status._ucount = 0;
/* create the subcommunicators */ /* create the subcommunicators */
mca_coll_han_comm_create(comm, han_module);
ompi_communicator_t *low_comm = ompi_communicator_t *low_comm =
han_module->cached_low_comms[mca_coll_han_component.han_gather_low_module]; han_module->cached_low_comms[mca_coll_han_component.han_gather_low_module];
ompi_communicator_t *up_comm = ompi_communicator_t *up_comm =
han_module->cached_up_comms[mca_coll_han_component.han_gather_up_module]; han_module->cached_up_comms[mca_coll_han_component.han_gather_up_module];
/* Get the 'virtual ranks' mapping correspondong to the communicators */ /* Get the 'virtual ranks' mapping correspondong to the communicators */
vranks = han_module->cached_vranks; vranks = han_module->cached_vranks;
@ -115,10 +126,9 @@ mca_coll_han_gather_intra(const void *sbuf, int scount,
mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank);
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"[%d]: Han Gather root %d root_low_rank %d root_up_rank %d\n", "[%d]: Han Gather root %d root_low_rank %d root_up_rank %d\n",
w_rank, root, root_low_rank, root_up_rank)); w_rank, root, root_low_rank, root_up_rank));
ompi_datatype_type_extent(rdtype, &rextent);
/* Allocate reorder buffers */ /* Allocate reorder buffers */
if (w_rank == root) { if (w_rank == root) {
@ -127,17 +137,30 @@ mca_coll_han_gather_intra(const void *sbuf, int scount,
* in a increasing order for both patterns */ * in a increasing order for both patterns */
if (han_module->is_mapbycore) { if (han_module->is_mapbycore) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"[%d]: Han Gather is_bycore: ", w_rank)); "[%d]: Han Gather is_bycore: ", w_rank));
reorder_rbuf = (char *)rbuf; reorder_rbuf = (char *)rbuf;
} else { } else {
/* Need a buffer to store unordered final result */ /* Need a buffer to store unordered final result */
ptrdiff_t rsize, rgap;
rsize = opal_datatype_span(&rdtype->super, rsize = opal_datatype_span(&rdtype->super,
(int64_t)rcount * w_size, (int64_t)rcount * w_size,
&rgap); &rgap);
reorder_buf = (char *)malloc(rsize); //TODO:free reorder_buf = (char *)malloc(rsize); //TODO:free
/* rgap is the size of unused space at the start of the datatype */ /* rgap is the size of unused space at the start of the datatype */
reorder_rbuf = reorder_buf - rgap; reorder_rbuf = reorder_buf - rgap;
if (MPI_IN_PLACE == sbuf) {
ptrdiff_t rextent;
ompi_datatype_type_extent(rdtype, &rextent);
ptrdiff_t block_size = rextent * (ptrdiff_t)rcount;
ptrdiff_t src_shift = block_size * w_rank;
ptrdiff_t dest_shift = block_size * w_rank;
ompi_datatype_copy_content_same_ddt(rdtype,
(ptrdiff_t)rcount,
(char *)rbuf + dest_shift,
reorder_rbuf + src_shift);
}
} }
} }
@ -145,12 +168,12 @@ mca_coll_han_gather_intra(const void *sbuf, int scount,
/* Create lg task */ /* Create lg task */
mca_coll_task_t *lg = OBJ_NEW(mca_coll_task_t); mca_coll_task_t *lg = OBJ_NEW(mca_coll_task_t);
/* Setup lg task arguments */ /* Setup lg task arguments */
mca_gather_argu_t *lg_argu = malloc(sizeof(mca_gather_argu_t)); mca_coll_han_gather_args_t *lg_args = malloc(sizeof(mca_coll_han_gather_args_t));
mac_coll_han_set_gather_argu(lg_argu, lg, (char *) sbuf, NULL, scount, sdtype, reorder_rbuf, mca_coll_han_set_gather_args(lg_args, lg, (char *) sbuf, NULL, scount, sdtype, reorder_rbuf,
rcount, rdtype, root, root_up_rank, root_low_rank, up_comm, rcount, rdtype, root, root_up_rank, root_low_rank, up_comm,
low_comm, w_rank, low_rank != root_low_rank, temp_request); low_comm, w_rank, low_rank != root_low_rank, han_module->is_mapbycore, temp_request);
/* Init lg task */ /* Init lg task */
init_task(lg, mca_coll_han_gather_lg_task, (void *) (lg_argu)); init_task(lg, mca_coll_han_gather_lg_task, (void *) (lg_args));
/* Issure lg task */ /* Issure lg task */
issue_task(lg); issue_task(lg);
@ -166,19 +189,21 @@ mca_coll_han_gather_intra(const void *sbuf, int scount,
*/ */
/* reorder rbuf based on rank */ /* reorder rbuf based on rank */
if (w_rank == root && !han_module->is_mapbycore) { if (w_rank == root && !han_module->is_mapbycore) {
ptrdiff_t rextent;
ompi_datatype_type_extent(rdtype, &rextent);
for (i=0; i<w_size; i++) { for (i=0; i<w_size; i++) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"[%d]: Han Gather copy from %d to %d\n", "[%d]: Han Gather copy from %d to %d\n",
w_rank, w_rank,
i * 2 + 1, i * 2 + 1,
topo[i * 2 + 1])); topo[i * 2 + 1]));
ptrdiff_t block_size = rextent * (ptrdiff_t)rcount; ptrdiff_t block_size = rextent * (ptrdiff_t)rcount;
ptrdiff_t src_shift = block_size * i; ptrdiff_t src_shift = block_size * i;
ptrdiff_t dest_shift = block_size * (ptrdiff_t)topo[i * 2 + 1]; ptrdiff_t dest_shift = block_size * (ptrdiff_t)topo[i * 2 + 1];
ompi_datatype_copy_content_same_ddt(rdtype, ompi_datatype_copy_content_same_ddt(rdtype,
(ptrdiff_t)rcount, (ptrdiff_t)rcount,
(char *)rbuf + dest_shift, reorder_rbuf + src_shift,
reorder_rbuf + src_shift); (char *)rbuf + dest_shift);
} }
free(reorder_buf); free(reorder_buf);
} }
@ -187,12 +212,20 @@ mca_coll_han_gather_intra(const void *sbuf, int scount,
} }
/* Perform a intra node gather and when it ends launch the inter node gather */ /* Perform a intra node gather and when it ends launch the inter node gather */
int mca_coll_han_gather_lg_task(void *task_argu) int mca_coll_han_gather_lg_task(void *task_args)
{ {
mca_gather_argu_t *t = (mca_gather_argu_t *) task_argu; mca_coll_han_gather_args_t *t = (mca_coll_han_gather_args_t *) task_args;
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] Han Gather: lg\n", OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] Han Gather: lg\n",
t->w_rank)); t->w_rank));
OBJ_RELEASE(t->cur_task); ompi_datatype_t *dtype;
size_t count;
if (t->w_rank == t->root) {
dtype = t->rdtype;
count = t->rcount;
} else {
dtype = t->sdtype;
count = t->scount;
}
/* If the process is one of the node leader */ /* If the process is one of the node leader */
char *tmp_buf = NULL; char *tmp_buf = NULL;
@ -201,33 +234,45 @@ int mca_coll_han_gather_lg_task(void *task_argu)
/* if the process is one of the node leader, allocate the intermediary /* if the process is one of the node leader, allocate the intermediary
* buffer to gather on the low sub communicator */ * buffer to gather on the low sub communicator */
int low_size = ompi_comm_size(t->low_comm); int low_size = ompi_comm_size(t->low_comm);
int low_rank = ompi_comm_rank(t->low_comm);
ptrdiff_t rsize, rgap = 0; ptrdiff_t rsize, rgap = 0;
rsize = opal_datatype_span(&t->rdtype->super, rsize = opal_datatype_span(&dtype->super,
(int64_t)t->rcount * low_size, count * low_size,
&rgap); &rgap);
tmp_buf = (char *) malloc(rsize); tmp_buf = (char *) malloc(rsize);
tmp_rbuf = tmp_buf - rgap; tmp_rbuf = tmp_buf - rgap;
if (t->w_rank == t->root) {
if (MPI_IN_PLACE == t->sbuf) {
ptrdiff_t rextent;
ompi_datatype_type_extent(dtype, &rextent);
ptrdiff_t block_size = rextent * (ptrdiff_t)count;
ptrdiff_t src_shift = block_size * t->w_rank;
ptrdiff_t dest_shift = block_size * low_rank;
ompi_datatype_copy_content_same_ddt(dtype,
(ptrdiff_t)count,
tmp_rbuf + dest_shift,
(char *)t->rbuf + src_shift);
}
}
} }
/* shared memory node gather */ /* Low level (usually intra-node or shared memory) node gather */
t->low_comm->c_coll->coll_gather((char *)t->sbuf, t->low_comm->c_coll->coll_gather((char *)t->sbuf,
t->scount, count,
t->sdtype, dtype,
tmp_rbuf, tmp_rbuf,
t->rcount, count,
t->rdtype, dtype,
t->root_low_rank, t->root_low_rank,
t->low_comm, t->low_comm,
t->low_comm->c_coll->coll_gather_module); t->low_comm->c_coll->coll_gather_module);
/* Prepare up comm gather */ /* Prepare up comm gather */
t->sbuf = tmp_rbuf; t->sbuf = tmp_rbuf;
t->sbuf_inter_free = tmp_buf; t->sbuf_inter_free = tmp_buf;
/* Create ug (upper level all-gather) task */ /* Create ug (upper level all-gather) task */
mca_coll_task_t *ug = OBJ_NEW(mca_coll_task_t); mca_coll_task_t *ug = t->cur_task;
/* Setup ug task arguments */
t->cur_task = ug;
/* Init ug task */ /* Init ug task */
init_task(ug, mca_coll_han_gather_ug_task, (void *) t); init_task(ug, mca_coll_han_gather_ug_task, (void *) t);
/* Issure ug task */ /* Issure ug task */
@ -237,26 +282,37 @@ int mca_coll_han_gather_lg_task(void *task_argu)
} }
/* ug: upper level (intra-node) gather task */ /* ug: upper level (intra-node) gather task */
int mca_coll_han_gather_ug_task(void *task_argu) int mca_coll_han_gather_ug_task(void *task_args)
{ {
mca_gather_argu_t *t = (mca_gather_argu_t *) task_argu; mca_coll_han_gather_args_t *t = (mca_coll_han_gather_args_t *) task_args;
OBJ_RELEASE(t->cur_task); OBJ_RELEASE(t->cur_task);
if (t->noop) { if (t->noop) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"[%d] Han Gather: ug noop\n", t->w_rank)); "[%d] Han Gather: ug noop\n", t->w_rank));
} else { } else {
ompi_datatype_t *dtype;
size_t count;
if (t->w_rank == t->root) {
dtype = t->rdtype;
count = t->rcount;
} else {
dtype = t->sdtype;
count = t->scount;
}
int low_size = ompi_comm_size(t->low_comm); int low_size = ompi_comm_size(t->low_comm);
/* inter node gather */ /* inter node gather */
t->up_comm->c_coll->coll_gather((char *)t->sbuf, t->up_comm->c_coll->coll_gather((char *)t->sbuf,
t->scount*low_size, count*low_size,
t->sdtype, dtype,
(char *)t->rbuf, (char *)t->rbuf,
t->rcount*low_size, count*low_size,
t->rdtype, dtype,
t->root_up_rank, t->root_up_rank,
t->up_comm, t->up_comm,
t->up_comm->c_coll->coll_gather_module); t->up_comm->c_coll->coll_gather_module);
if (t->sbuf_inter_free != NULL) { if (t->sbuf_inter_free != NULL) {
free(t->sbuf_inter_free); free(t->sbuf_inter_free);
@ -274,36 +330,56 @@ int mca_coll_han_gather_ug_task(void *task_argu)
/* only work with regular situation (each node has equal number of processes) */ /* only work with regular situation (each node has equal number of processes) */
int int
mca_coll_han_gather_intra_simple(const void *sbuf, int scount, mca_coll_han_gather_intra_simple(const void *sbuf, int scount,
struct ompi_datatype_t *sdtype, struct ompi_datatype_t *sdtype,
void *rbuf, int rcount, void *rbuf, int rcount,
struct ompi_datatype_t *rdtype, struct ompi_datatype_t *rdtype,
int root, int root,
struct ompi_communicator_t *comm, struct ompi_communicator_t *comm,
mca_coll_base_module_t *module) mca_coll_base_module_t *module)
{ {
int w_rank = ompi_comm_rank(comm); mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;
int *topo, w_rank = ompi_comm_rank(comm);
int w_size = ompi_comm_size(comm); int w_size = ompi_comm_size(comm);
mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; /* Create the subcommunicators */
/* Topo must be initialized to know rank distribution which then is used to if( OMPI_SUCCESS != mca_coll_han_comm_create_new(comm, han_module) ) { /* Let's hope the error is consistently returned across the entire communicator */
* determine if han can be used */
int *topo = mca_coll_han_topo_init(comm, han_module, 2);
/* Here root needs to reach all nodes on up_comm.
* But in case of unbalance some up_comms are smaller,
* as the comm_split is made on the base of low_rank */
if (han_module->are_ppn_imbalanced){
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle gather with this communicator. It need to fall back on another component\n")); "han cannot handle gather with this communicator. Fall back on another component\n"));
return han_module->previous_gather(sbuf, scount, sdtype, rbuf, /* HAN cannot work with this communicator so fallback on all collectives */
rcount, rdtype, root, HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
comm, han_module->previous_gather_module); return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf,
rcount, rdtype, root,
comm, comm->c_coll->coll_gather_module);
}
/* Topo must be initialized to know rank distribution which then is used to
* determine if han can be used */
topo = mca_coll_han_topo_init(comm, han_module, 2);
if (han_module->are_ppn_imbalanced){
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle gather with this communicator (imbalance). Fall back on another component\n"));
/* Put back the fallback collective support and call it once. All
* future calls will then be automatically redirected.
*/
HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, gather);
return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf,
rcount, rdtype, root,
comm, comm->c_coll->coll_gather_module);
} }
/* create the subcommunicators */
mca_coll_han_comm_create_new(comm, han_module);
ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE];
ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE];
ompi_datatype_t *dtype;
size_t count;
if (w_rank == root) {
dtype = rdtype;
count = rcount;
} else {
dtype = sdtype;
count = scount;
}
/* Get the 'virtual ranks' mapping corresponding to the communicators */ /* Get the 'virtual ranks' mapping corresponding to the communicators */
int *vranks = han_module->cached_vranks; int *vranks = han_module->cached_vranks;
@ -325,11 +401,11 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount,
reorder_buf_start = (char *)rbuf; reorder_buf_start = (char *)rbuf;
} else { } else {
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"[%d]: Future Gather needs reordering: ", w_rank)); "[%d]: Future Gather needs reordering: ", w_rank));
ptrdiff_t rgap = 0; ptrdiff_t rgap = 0;
ptrdiff_t rsize = opal_datatype_span(&rdtype->super, ptrdiff_t rsize = opal_datatype_span(&rdtype->super,
(int64_t)rcount * w_size, (int64_t)rcount * w_size,
&rgap); &rgap);
reorder_buf = (char *)malloc(rsize); reorder_buf = (char *)malloc(rsize);
/* rgap is the size of unused space at the start of the datatype */ /* rgap is the size of unused space at the start of the datatype */
reorder_buf_start = reorder_buf - rgap; reorder_buf_start = reorder_buf - rgap;
@ -338,40 +414,40 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount,
} }
/* allocate the intermediary buffer /* allocate the intermediary buffer
* * to gather on leaders on the low sub communicator */ * to gather on leaders on the low sub communicator */
char *tmp_buf = NULL; // allocated memory char *tmp_buf = NULL; // allocated memory
char *tmp_buf_start = NULL; // start of the data char *tmp_buf_start = NULL; // start of the data
if (low_rank == root_low_rank) { if (low_rank == root_low_rank) {
ptrdiff_t rsize, rgap = 0; ptrdiff_t rsize, rgap = 0;
rsize = opal_datatype_span(&rdtype->super, rsize = opal_datatype_span(&dtype->super,
(int64_t)rcount * low_size, count * low_size,
&rgap); &rgap);
tmp_buf = (char *) malloc(rsize); tmp_buf = (char *) malloc(rsize);
tmp_buf_start = tmp_buf - rgap; tmp_buf_start = tmp_buf - rgap;
} }
/* 1. low gather on nodes leaders */ /* 1. low gather on nodes leaders */
low_comm->c_coll->coll_gather((char *)sbuf, low_comm->c_coll->coll_gather((char *)sbuf,
scount, count,
sdtype, dtype,
tmp_buf_start, tmp_buf_start,
rcount, count,
rdtype, dtype,
root_low_rank, root_low_rank,
low_comm, low_comm,
low_comm->c_coll->coll_gather_module); low_comm->c_coll->coll_gather_module);
/* 2. upper gather (inter-node) between node leaders */ /* 2. upper gather (inter-node) between node leaders */
if (low_rank == root_low_rank) { if (low_rank == root_low_rank) {
up_comm->c_coll->coll_gather((char *)tmp_buf_start, up_comm->c_coll->coll_gather((char *)tmp_buf_start,
scount*low_size, count*low_size,
sdtype, dtype,
(char *)reorder_buf_start, (char *)reorder_buf_start,
rcount*low_size, count*low_size,
rdtype, dtype,
root_up_rank, root_up_rank,
up_comm, up_comm,
up_comm->c_coll->coll_gather_module); up_comm->c_coll->coll_gather_module);
if (tmp_buf != NULL) { if (tmp_buf != NULL) {
free(tmp_buf); free(tmp_buf);
@ -379,7 +455,7 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount,
tmp_buf_start = NULL; tmp_buf_start = NULL;
} }
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"[%d] Future Gather: ug gather finish\n", t->w_rank)); "[%d] Future Gather: ug gather finish\n", w_rank));
} }
/* 3. reorder data on root into rbuf /* 3. reorder data on root into rbuf
@ -388,8 +464,8 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount,
*/ */
if (w_rank == root && !han_module->is_mapbycore) { if (w_rank == root && !han_module->is_mapbycore) {
ompi_coll_han_reorder_gather(reorder_buf_start, ompi_coll_han_reorder_gather(reorder_buf_start,
rbuf, rcount, rdtype, rbuf, rcount, rdtype,
comm, topo); comm, topo);
free(reorder_buf); free(reorder_buf);
} }
@ -408,28 +484,28 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount,
*/ */
void void
ompi_coll_han_reorder_gather(const void *sbuf, ompi_coll_han_reorder_gather(const void *sbuf,
void *rbuf, int rcount, void *rbuf, int rcount,
struct ompi_datatype_t *rdtype, struct ompi_datatype_t *dtype,
struct ompi_communicator_t *comm, struct ompi_communicator_t *comm,
int * topo) { int * topo)
int i; {
int topolevel = 2; // always 2 levels in topo int i, topolevel = 2; // always 2 levels in topo
int w_rank = ompi_comm_rank(comm); int w_rank = ompi_comm_rank(comm);
int w_size = ompi_comm_size(comm); int w_size = ompi_comm_size(comm);
ptrdiff_t rextent; ptrdiff_t rextent;
ompi_datatype_type_extent(rdtype, &rextent); ompi_datatype_type_extent(dtype, &rextent);
for (i=0; i<w_size; i++) { for ( i = 0; i < w_size; i++ ) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"[%d]: Future reorder from %d to %d\n", "[%d]: Future reorder from %d to %d\n",
w_rank, w_rank,
i * topolevel + 1, i * topolevel + 1,
topo[i * topolevel + 1])); topo[i * topolevel + 1]));
ptrdiff_t block_size = rextent * (ptrdiff_t)rcount; ptrdiff_t block_size = rextent * (ptrdiff_t)rcount;
ptrdiff_t src_shift = block_size * i; ptrdiff_t src_shift = block_size * i;
ptrdiff_t dest_shift = block_size * (ptrdiff_t)topo[i * topolevel + 1]; ptrdiff_t dest_shift = block_size * (ptrdiff_t)topo[i * topolevel + 1];
ompi_datatype_copy_content_same_ddt(rdtype, ompi_datatype_copy_content_same_ddt(dtype,
(ptrdiff_t)rcount, (ptrdiff_t)rcount,
(char *)rbuf + dest_shift, (char *)rbuf + dest_shift,
(char *)sbuf + src_shift); (char *)sbuf + src_shift);
} }
} }

Просмотреть файл

@ -25,21 +25,25 @@ static int han_module_enable(mca_coll_base_module_t * module,
static int mca_coll_han_module_disable(mca_coll_base_module_t * module, static int mca_coll_han_module_disable(mca_coll_base_module_t * module,
struct ompi_communicator_t *comm); struct ompi_communicator_t *comm);
#define CLEAN_PREV_COLL(HANDLE, NAME) \
do { \
(HANDLE)->fallback.NAME.NAME = NULL; \
(HANDLE)->fallback.NAME.module = NULL; \
} while (0)
/* /*
* Module constructor * Module constructor
*/ */
static void han_module_clear(mca_coll_han_module_t *han_module) static void han_module_clear(mca_coll_han_module_t *han_module)
{ {
int i; CLEAN_PREV_COLL(han_module, allgather);
CLEAN_PREV_COLL(han_module, allgatherv);
CLEAN_PREV_COLL(han_module, allreduce);
CLEAN_PREV_COLL(han_module, bcast);
CLEAN_PREV_COLL(han_module, reduce);
CLEAN_PREV_COLL(han_module, gather);
CLEAN_PREV_COLL(han_module, scatter);
for (i = 0; i < COLLCOUNT; i++) {
/*
* Since the previous routines function pointers are declared as
* a union, initializing the dummy routineis enough
*/
han_module->previous_routines[i].previous_routine.dummy = NULL;
han_module->previous_routines[i].previous_module = NULL;
}
han_module->reproducible_reduce = NULL; han_module->reproducible_reduce = NULL;
han_module->reproducible_reduce_module = NULL; han_module->reproducible_reduce_module = NULL;
han_module->reproducible_allreduce = NULL; han_module->reproducible_allreduce = NULL;
@ -50,19 +54,18 @@ static void mca_coll_han_module_construct(mca_coll_han_module_t * module)
{ {
int i; int i;
module->enabled = false; module->enabled = true;
module->super.coll_module_disable = mca_coll_han_module_disable; module->super.coll_module_disable = mca_coll_han_module_disable;
module->cached_comm = NULL;
module->cached_low_comms = NULL; module->cached_low_comms = NULL;
module->cached_up_comms = NULL; module->cached_up_comms = NULL;
module->cached_vranks = NULL; module->cached_vranks = NULL;
module->cached_topo = NULL; module->cached_topo = NULL;
module->is_mapbycore = false; module->is_mapbycore = false;
module->storage_initialized = false; module->storage_initialized = false;
for (i = 0 ; i < NB_TOPO_LVL ; i++) { for( i = 0; i < NB_TOPO_LVL; i++ ) {
module->sub_comm[i] = NULL; module->sub_comm[i] = NULL;
} }
for (i=SELF ; i<COMPONENTS_COUNT ; i++) { for( i = SELF; i < COMPONENTS_COUNT; i++ ) {
module->modules_storage.modules[i].module_handler = NULL; module->modules_storage.modules[i].module_handler = NULL;
} }
@ -72,16 +75,18 @@ static void mca_coll_han_module_construct(mca_coll_han_module_t * module)
} }
#define OBJ_RELEASE_IF_NOT_NULL(obj) do { \ #define OBJ_RELEASE_IF_NOT_NULL(obj) \
if (NULL != (obj)) { \ do { \
OBJ_RELEASE(obj); \ if (NULL != (obj)) { \
} \ OBJ_RELEASE(obj); \
} while (0) } \
} while (0)
/* /*
* Module destructor * Module destructor
*/ */
static void mca_coll_han_module_destruct(mca_coll_han_module_t * module) static void
mca_coll_han_module_destruct(mca_coll_han_module_t * module)
{ {
int i; int i;
@ -126,7 +131,6 @@ static void mca_coll_han_module_destruct(mca_coll_han_module_t * module)
han_module_clear(module); han_module_clear(module);
} }
OBJ_CLASS_INSTANCE(mca_coll_han_module_t, OBJ_CLASS_INSTANCE(mca_coll_han_module_t,
mca_coll_base_module_t, mca_coll_base_module_t,
mca_coll_han_module_construct, mca_coll_han_module_construct,
@ -155,6 +159,8 @@ int mca_coll_han_init_query(bool enable_progress_threads,
mca_coll_base_module_t * mca_coll_base_module_t *
mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority) mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority)
{ {
int flag;
char info_val[OPAL_MAX_INFO_VAL+1];
mca_coll_han_module_t *han_module; mca_coll_han_module_t *han_module;
/* /*
@ -172,7 +178,13 @@ mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority)
comm->c_contextid, comm->c_name); comm->c_contextid, comm->c_name);
return NULL; return NULL;
} }
if( !ompi_group_have_remote_peers(comm->c_local_group) ) {
/* The group only contains local processes. Disable HAN for now */
opal_output_verbose(10, ompi_coll_base_framework.framework_output,
"coll:han:comm_query (%d/%s): comm has only local processes; disqualifying myself",
comm->c_contextid, comm->c_name);
return NULL;
}
/* Get the priority level attached to this module. If priority is less /* Get the priority level attached to this module. If priority is less
* than or equal to 0, then the module is unavailable. */ * than or equal to 0, then the module is unavailable. */
*priority = mca_coll_han_component.han_priority; *priority = mca_coll_han_component.han_priority;
@ -189,52 +201,46 @@ mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority)
} }
/* All is good -- return a module */ /* All is good -- return a module */
han_module->topologic_level = mca_coll_han_component.topo_level; han_module->topologic_level = GLOBAL_COMMUNICATOR;
if (NULL != comm->super.s_info) {
/* Get the info value disaqualifying coll components */
opal_info_get(comm->super.s_info, "ompi_comm_coll_han_topo_level",
sizeof(info_val), info_val, &flag);
if (flag) {
if (0 == strcmp(info_val, "INTER_NODE")) {
han_module->topologic_level = INTER_NODE;
} else {
han_module->topologic_level = INTRA_NODE;
}
}
}
han_module->super.coll_module_enable = han_module_enable;
han_module->super.ft_event = NULL;
han_module->super.coll_alltoall = NULL;
han_module->super.coll_alltoallv = NULL;
han_module->super.coll_alltoallw = NULL;
han_module->super.coll_barrier = NULL;
han_module->super.coll_exscan = NULL;
han_module->super.coll_gatherv = NULL;
han_module->super.coll_reduce_scatter = NULL;
han_module->super.coll_scan = NULL;
han_module->super.coll_scatterv = NULL;
han_module->super.coll_scatter = mca_coll_han_scatter_intra_dynamic;
han_module->super.coll_reduce = mca_coll_han_reduce_intra_dynamic;
han_module->super.coll_gather = mca_coll_han_gather_intra_dynamic;
han_module->super.coll_bcast = mca_coll_han_bcast_intra_dynamic;
han_module->super.coll_allreduce = mca_coll_han_allreduce_intra_dynamic;
han_module->super.coll_allgather = mca_coll_han_allgather_intra_dynamic;
/*
* TODO: When the selector is fully implemented,
* this if will be meaningless
*/
if (GLOBAL_COMMUNICATOR == han_module->topologic_level) { if (GLOBAL_COMMUNICATOR == han_module->topologic_level) {
/* We are on the global communicator, return topological algorithms */ /* We are on the global communicator, return topological algorithms */
han_module->super.coll_module_enable = han_module_enable;
han_module->super.ft_event = NULL;
han_module->super.coll_allgather = mca_coll_han_allgather_intra_dynamic;
han_module->super.coll_allgatherv = NULL; han_module->super.coll_allgatherv = NULL;
han_module->super.coll_allreduce = mca_coll_han_allreduce_intra_dynamic;
han_module->super.coll_alltoall = NULL;
han_module->super.coll_alltoallv = NULL;
han_module->super.coll_alltoallw = NULL;
han_module->super.coll_barrier = NULL;
han_module->super.coll_bcast = mca_coll_han_bcast_intra_dynamic;
han_module->super.coll_exscan = NULL;
han_module->super.coll_gather = mca_coll_han_gather_intra_dynamic;
han_module->super.coll_gatherv = NULL;
han_module->super.coll_reduce = mca_coll_han_reduce_intra_dynamic;
han_module->super.coll_reduce_scatter = NULL;
han_module->super.coll_scan = NULL;
han_module->super.coll_scatter = mca_coll_han_scatter_intra_dynamic;
han_module->super.coll_scatterv = NULL;
} else { } else {
/* We are on a topologic sub-communicator, return only the selector */ /* We are on a topologic sub-communicator, return only the selector */
han_module->super.coll_module_enable = han_module_enable;
han_module->super.ft_event = NULL;
han_module->super.coll_allgather = mca_coll_han_allgather_intra_dynamic;
han_module->super.coll_allgatherv = mca_coll_han_allgatherv_intra_dynamic; han_module->super.coll_allgatherv = mca_coll_han_allgatherv_intra_dynamic;
han_module->super.coll_allreduce = mca_coll_han_allreduce_intra_dynamic;
han_module->super.coll_alltoall = NULL;
han_module->super.coll_alltoallv = NULL;
han_module->super.coll_alltoallw = NULL;
han_module->super.coll_barrier = NULL;
han_module->super.coll_bcast = mca_coll_han_bcast_intra_dynamic;
han_module->super.coll_exscan = NULL;
han_module->super.coll_gather = mca_coll_han_gather_intra_dynamic;
han_module->super.coll_gatherv = NULL;
han_module->super.coll_reduce = mca_coll_han_reduce_intra_dynamic;
han_module->super.coll_reduce_scatter = NULL;
han_module->super.coll_scan = NULL;
han_module->super.coll_scatter = mca_coll_han_scatter_intra_dynamic;
han_module->super.coll_scatterv = NULL;
} }
opal_output_verbose(10, ompi_coll_base_framework.framework_output, opal_output_verbose(10, ompi_coll_base_framework.framework_output,
@ -247,28 +253,28 @@ mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority)
/* /*
* In this macro, the following variables are supposed to have been declared * In this macro, the following variables are supposed to have been declared
* in the caller: * in the caller:
* . ompi_communicator_t *comm * . ompi_communicator_t *comm
* . mca_coll_han_module_t *han_module * . mca_coll_han_module_t *han_module
*/ */
#define HAN_SAVE_PREV_COLL_API(__api) do { \ #define HAN_SAVE_PREV_COLL_API(__api) \
han_module->previous_ ## __api = comm->c_coll->coll_ ## __api; \ do { \
han_module->previous_ ## __api ## _module = comm->c_coll->coll_ ## __api ## _module;\ if (!comm->c_coll->coll_ ## __api || !comm->c_coll->coll_ ## __api ## _module) { \
if (!comm->c_coll->coll_ ## __api || !comm->c_coll->coll_ ## __api ## _module) { \ opal_output_verbose(1, ompi_coll_base_framework.framework_output, \
opal_output_verbose(1, ompi_coll_base_framework.framework_output, \ "(%d/%s): no underlying " # __api"; disqualifying myself", \
"(%d/%s): no underlying " # __api"; disqualifying myself", \ comm->c_contextid, comm->c_name); \
comm->c_contextid, comm->c_name); \ goto handle_error; \
return OMPI_ERROR; \ } \
} \ han_module->previous_ ## __api = comm->c_coll->coll_ ## __api; \
/* TODO add a OBJ_RELEASE at module disabling */ \ han_module->previous_ ## __api ## _module = comm->c_coll->coll_ ## __api ## _module; \
/* + FIXME find why releasing generates memory corruption */ \ OBJ_RETAIN(han_module->previous_ ## __api ## _module); \
OBJ_RETAIN(han_module->previous_ ## __api ## _module); \
} while(0) } while(0)
/* /*
* Init module on the communicator * Init module on the communicator
*/ */
static int han_module_enable(mca_coll_base_module_t * module, static int
struct ompi_communicator_t *comm) han_module_enable(mca_coll_base_module_t * module,
struct ompi_communicator_t *comm)
{ {
mca_coll_han_module_t * han_module = (mca_coll_han_module_t*) module; mca_coll_han_module_t * han_module = (mca_coll_han_module_t*) module;
@ -285,13 +291,25 @@ static int han_module_enable(mca_coll_base_module_t * module,
mca_coll_han_allreduce_reproducible_decision(comm, module); mca_coll_han_allreduce_reproducible_decision(comm, module);
return OMPI_SUCCESS; return OMPI_SUCCESS;
handle_error:
OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allgather_module);
OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allgatherv_module);
OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allreduce_module);
OBJ_RELEASE_IF_NOT_NULL(han_module->previous_bcast_module);
OBJ_RELEASE_IF_NOT_NULL(han_module->previous_gather_module);
OBJ_RELEASE_IF_NOT_NULL(han_module->previous_reduce_module);
OBJ_RELEASE_IF_NOT_NULL(han_module->previous_scatter_module);
return OMPI_ERROR;
} }
/* /*
* Module disable * Module disable
*/ */
static int mca_coll_han_module_disable(mca_coll_base_module_t * module, static int
struct ompi_communicator_t *comm) mca_coll_han_module_disable(mca_coll_base_module_t * module,
struct ompi_communicator_t *comm)
{ {
mca_coll_han_module_t * han_module = (mca_coll_han_module_t *) module; mca_coll_han_module_t * han_module = (mca_coll_han_module_t *) module;

Просмотреть файл

@ -15,33 +15,38 @@
#include "ompi/mca/pml/pml.h" #include "ompi/mca/pml/pml.h"
#include "coll_han_trigger.h" #include "coll_han_trigger.h"
void mac_coll_han_set_reduce_argu(mca_reduce_argu_t * argu, mca_coll_task_t * cur_task, void *sbuf, void *rbuf, static int mca_coll_han_reduce_t0_task(void *task_args);
int seg_count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, static int mca_coll_han_reduce_t1_task(void *task_args);
int root_up_rank, int root_low_rank,
struct ompi_communicator_t *up_comm, static inline void
struct ompi_communicator_t *low_comm, mca_coll_han_set_reduce_args(mca_coll_han_reduce_args_t * args, mca_coll_task_t * cur_task, void *sbuf, void *rbuf,
int num_segments, int cur_seg, int w_rank, int last_seg_count, int seg_count, struct ompi_datatype_t *dtype, struct ompi_op_t *op,
bool noop) int root_up_rank, int root_low_rank,
struct ompi_communicator_t *up_comm,
struct ompi_communicator_t *low_comm,
int num_segments, int cur_seg, int w_rank, int last_seg_count,
bool noop, bool is_tmp_rbuf)
{ {
argu->cur_task = cur_task; args->cur_task = cur_task;
argu->sbuf = sbuf; args->sbuf = sbuf;
argu->rbuf = rbuf; args->rbuf = rbuf;
argu->seg_count = seg_count; args->seg_count = seg_count;
argu->dtype = dtype; args->dtype = dtype;
argu->op = op; args->op = op;
argu->root_low_rank = root_low_rank; args->root_low_rank = root_low_rank;
argu->root_up_rank = root_up_rank; args->root_up_rank = root_up_rank;
argu->up_comm = up_comm; args->up_comm = up_comm;
argu->low_comm = low_comm; args->low_comm = low_comm;
argu->num_segments = num_segments; args->num_segments = num_segments;
argu->cur_seg = cur_seg; args->cur_seg = cur_seg;
argu->w_rank = w_rank; args->w_rank = w_rank;
argu->last_seg_count = last_seg_count; args->last_seg_count = last_seg_count;
argu->noop = noop; args->noop = noop;
args->is_tmp_rbuf = is_tmp_rbuf;
} }
/* /*
* Each segment of the messsage needs to go though 2 steps to perform MPI_Reduce: * Each segment of the messsage needs to go though 2 steps to perform MPI_Reduce:
* lb: low level (shared-memory or intra-node) reduce. * lb: low level (shared-memory or intra-node) reduce.
* ub: upper level (inter-node) reduce * ub: upper level (inter-node) reduce
* Hence, in each iteration, there is a combination of collective operations which is called a task. * Hence, in each iteration, there is a combination of collective operations which is called a task.
@ -53,49 +58,62 @@ void mac_coll_han_set_reduce_argu(mca_reduce_argu_t * argu, mca_coll_task_t * cu
* iter 4 | | | | ur | task: t1, contains ur * iter 4 | | | | ur | task: t1, contains ur
*/ */
int int
mca_coll_han_reduce_intra(const void *sbuf, mca_coll_han_reduce_intra(const void *sbuf,
void *rbuf, void *rbuf,
int count, int count,
struct ompi_datatype_t *dtype, struct ompi_datatype_t *dtype,
ompi_op_t* op, ompi_op_t* op,
int root, int root,
struct ompi_communicator_t *comm, struct ompi_communicator_t *comm,
mca_coll_base_module_t * module) mca_coll_base_module_t * module)
{ {
ptrdiff_t extent, lb;
ompi_datatype_get_extent(dtype, &lb, &extent);
int w_rank;
w_rank = ompi_comm_rank(comm);
int seg_count = count;
size_t typelng;
ompi_datatype_type_size(dtype, &typelng);
mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module;
/* Do not initialize topology if the operation cannot commute */ ptrdiff_t extent, lb;
if(!ompi_op_is_commute(op)){ int seg_count = count, w_rank;
size_t dtype_size;
/* No support for non-commutative operations */
if(!ompi_op_is_commute(op)) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle reduce with this operation. It needs to fall back on another component\n")); "han cannot handle reduce with this operation. Fall back on another component\n"));
goto prev_reduce_intra; goto prev_reduce_intra;
} }
/* Create the subcommunicators */
if( OMPI_SUCCESS != mca_coll_han_comm_create(comm, han_module) ) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle reduce with this communicator. Drop HAN support in this communicator and fall back on another component\n"));
/* HAN cannot work with this communicator so fallback on all modules */
HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
return comm->c_coll->coll_reduce(sbuf, rbuf, count, dtype, op, root,
comm, comm->c_coll->coll_reduce_module);
}
/* Topo must be initialized to know rank distribution which then is used to /* Topo must be initialized to know rank distribution which then is used to
* determine if han can be used */ * determine if han can be used */
mca_coll_han_topo_init(comm, han_module, 2); mca_coll_han_topo_init(comm, han_module, 2);
if (han_module->are_ppn_imbalanced){ if (han_module->are_ppn_imbalanced) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle reduce with this communicator. It needs to fall back on another component\n")); "han cannot handle reduce with this communicator (imbalanced). Drop HAN support in this communicator and fall back on another component\n"));
goto prev_reduce_intra; /* Put back the fallback collective support and call it once. All
* future calls will then be automatically redirected.
*/
HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, reduce);
return comm->c_coll->coll_reduce(sbuf, rbuf, count, dtype, op, root,
comm, comm->c_coll->coll_reduce_module);
} }
/* Create the subcommunicators */ ompi_datatype_get_extent(dtype, &lb, &extent);
mca_coll_han_comm_create(comm, han_module); w_rank = ompi_comm_rank(comm);
ompi_datatype_type_size(dtype, &dtype_size);
ompi_communicator_t *low_comm; ompi_communicator_t *low_comm;
ompi_communicator_t *up_comm; ompi_communicator_t *up_comm;
/* use MCA parameters for now */ /* use MCA parameters for now */
low_comm = han_module->cached_low_comms[mca_coll_han_component.han_reduce_low_module]; low_comm = han_module->cached_low_comms[mca_coll_han_component.han_reduce_low_module];
up_comm = han_module->cached_up_comms[mca_coll_han_component.han_reduce_up_module]; up_comm = han_module->cached_up_comms[mca_coll_han_component.han_reduce_up_module];
COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_reduce_segsize, typelng, COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_reduce_segsize, dtype_size,
seg_count); seg_count);
int num_segments = (count + seg_count - 1) / seg_count; int num_segments = (count + seg_count - 1) / seg_count;
@ -106,6 +124,7 @@ mca_coll_han_reduce_intra(const void *sbuf,
int *vranks = han_module->cached_vranks; int *vranks = han_module->cached_vranks;
int low_rank = ompi_comm_rank(low_comm); int low_rank = ompi_comm_rank(low_comm);
int low_size = ompi_comm_size(low_comm); int low_size = ompi_comm_size(low_comm);
int up_rank = ompi_comm_rank(up_comm);
int root_low_rank; int root_low_rank;
int root_up_rank; int root_up_rank;
@ -114,14 +133,22 @@ mca_coll_han_reduce_intra(const void *sbuf,
"[%d]: root_low_rank %d root_up_rank %d\n", w_rank, root_low_rank, "[%d]: root_low_rank %d root_up_rank %d\n", w_rank, root_low_rank,
root_up_rank)); root_up_rank));
void *tmp_rbuf = rbuf;
void *tmp_rbuf_to_free = NULL;
if (low_rank == root_low_rank && root_up_rank != up_rank) {
/* allocate 2 segments on node leaders that are not the global root */
tmp_rbuf = malloc(2*extent*seg_count);
tmp_rbuf_to_free = tmp_rbuf;
}
/* Create t0 tasks for the first segment */ /* Create t0 tasks for the first segment */
mca_coll_task_t *t0 = OBJ_NEW(mca_coll_task_t); mca_coll_task_t *t0 = OBJ_NEW(mca_coll_task_t);
/* Setup up t0 task arguments */ /* Setup up t0 task arguments */
mca_reduce_argu_t *t = malloc(sizeof(mca_reduce_argu_t)); mca_coll_han_reduce_args_t *t = malloc(sizeof(mca_coll_han_reduce_args_t));
mac_coll_han_set_reduce_argu(t, t0, (char *) sbuf, (char *) rbuf, seg_count, dtype, mca_coll_han_set_reduce_args(t, t0, (char *) sbuf, (char *) tmp_rbuf, seg_count, dtype,
op, root_up_rank, root_low_rank, up_comm, low_comm, op, root_up_rank, root_low_rank, up_comm, low_comm,
num_segments, 0, w_rank, count - (num_segments - 1) * seg_count, num_segments, 0, w_rank, count - (num_segments - 1) * seg_count,
low_rank != root_low_rank); low_rank != root_low_rank, (NULL != tmp_rbuf_to_free));
/* Init the first task */ /* Init the first task */
init_task(t0, mca_coll_han_reduce_t0_task, (void *) t); init_task(t0, mca_coll_han_reduce_t0_task, (void *) t);
issue_task(t0); issue_task(t0);
@ -140,7 +167,9 @@ mca_coll_han_reduce_intra(const void *sbuf,
/* Setup up t1 task arguments */ /* Setup up t1 task arguments */
t->cur_task = t1; t->cur_task = t1;
t->sbuf = (char *) t->sbuf + extent * t->seg_count; t->sbuf = (char *) t->sbuf + extent * t->seg_count;
t->rbuf = (char *) t->rbuf + extent * t->seg_count; if (up_rank == root_up_rank) {
t->rbuf = (char *) t->rbuf + extent * t->seg_count;
}
t->cur_seg = t->cur_seg + 1; t->cur_seg = t->cur_seg + 1;
/* Init the t1 task */ /* Init the t1 task */
init_task(t1, mca_coll_han_reduce_t1_task, (void *) t); init_task(t1, mca_coll_han_reduce_t1_task, (void *) t);
@ -148,19 +177,20 @@ mca_coll_han_reduce_intra(const void *sbuf,
} }
free(t); free(t);
free(tmp_rbuf_to_free);
return OMPI_SUCCESS; return OMPI_SUCCESS;
prev_reduce_intra: prev_reduce_intra:
return han_module->previous_reduce(sbuf, rbuf, count, dtype, op, root, return han_module->previous_reduce(sbuf, rbuf, count, dtype, op, root,
comm, comm,
han_module->previous_reduce_module); han_module->previous_reduce_module);
} }
/* t0 task: issue and wait for the low level reduce of segment 0 */ /* t0 task: issue and wait for the low level reduce of segment 0 */
int mca_coll_han_reduce_t0_task(void *task_argu) int mca_coll_han_reduce_t0_task(void *task_args)
{ {
mca_reduce_argu_t *t = (mca_reduce_argu_t *) task_argu; mca_coll_han_reduce_args_t *t = (mca_coll_han_reduce_args_t *) task_args;
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t0 %d\n", t->w_rank, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t0 %d\n", t->w_rank,
t->cur_seg)); t->cur_seg));
OBJ_RELEASE(t->cur_task); OBJ_RELEASE(t->cur_task);
@ -173,41 +203,55 @@ int mca_coll_han_reduce_t0_task(void *task_argu)
} }
/* t1 task */ /* t1 task */
int mca_coll_han_reduce_t1_task(void *task_argu) { int mca_coll_han_reduce_t1_task(void *task_args) {
mca_reduce_argu_t *t = (mca_reduce_argu_t *) task_argu; mca_coll_han_reduce_args_t *t = (mca_coll_han_reduce_args_t *) task_args;
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t1 %d\n", t->w_rank, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t1 %d\n", t->w_rank,
t->cur_seg)); t->cur_seg));
OBJ_RELEASE(t->cur_task); OBJ_RELEASE(t->cur_task);
ptrdiff_t extent, lb; ptrdiff_t extent, lb;
int cur_seg = t->cur_seg;
ompi_datatype_get_extent(t->dtype, &lb, &extent); ompi_datatype_get_extent(t->dtype, &lb, &extent);
ompi_request_t *ireduce_req = NULL; ompi_request_t *ireduce_req = NULL;
int tmp_count = t->seg_count;
if (!t->noop) { if (!t->noop) {
int tmp_count = t->seg_count;
if (cur_seg == t->num_segments - 1 && t->last_seg_count != t->seg_count) {
tmp_count = t->last_seg_count;
}
int up_rank = ompi_comm_rank(t->up_comm); int up_rank = ompi_comm_rank(t->up_comm);
/* ur of cur_seg */ /* ur of cur_seg */
if (up_rank == t->root_up_rank) { if (up_rank == t->root_up_rank) {
t->up_comm->c_coll->coll_ireduce(MPI_IN_PLACE, (char *) t->rbuf, t->seg_count, t->dtype, t->up_comm->c_coll->coll_ireduce(MPI_IN_PLACE, (char *) t->rbuf, tmp_count, t->dtype,
t->op, t->root_up_rank, t->up_comm, &ireduce_req, t->op, t->root_up_rank, t->up_comm, &ireduce_req,
t->up_comm->c_coll->coll_ireduce_module); t->up_comm->c_coll->coll_ireduce_module);
} else { } else {
t->up_comm->c_coll->coll_ireduce((char *) t->rbuf, (char *) t->rbuf, t->seg_count, /* this is a node leader that is not root so alternate between the two allocated segments */
char *tmp_sbuf = (char*)t->rbuf + (cur_seg % 2)*(extent * t->seg_count);
t->up_comm->c_coll->coll_ireduce(tmp_sbuf, NULL, tmp_count,
t->dtype, t->op, t->root_up_rank, t->up_comm, t->dtype, t->op, t->root_up_rank, t->up_comm,
&ireduce_req, t->up_comm->c_coll->coll_ireduce_module); &ireduce_req, t->up_comm->c_coll->coll_ireduce_module);
} }
} }
/* lr of cur_seg+1 */ /* lr of cur_seg+1 */
if (t->cur_seg <= t->num_segments - 2) { int next_seg = cur_seg + 1;
if (t->cur_seg == t->num_segments - 2 && t->last_seg_count != t->seg_count) { if (next_seg <= t->num_segments - 1) {
int tmp_count = t->seg_count;
char *tmp_rbuf = NULL;
if (next_seg == t->num_segments - 1 && t->last_seg_count != t->seg_count) {
tmp_count = t->last_seg_count; tmp_count = t->last_seg_count;
} }
if (t->is_tmp_rbuf) {
tmp_rbuf = (char*)t->rbuf + (next_seg % 2)*(extent * t->seg_count);
} else if (NULL != t->rbuf) {
tmp_rbuf = (char*)t->rbuf + extent * t->seg_count;
}
t->low_comm->c_coll->coll_reduce((char *) t->sbuf + extent * t->seg_count, t->low_comm->c_coll->coll_reduce((char *) t->sbuf + extent * t->seg_count,
(char *) t->rbuf + extent * t->seg_count, tmp_count, (char *) tmp_rbuf, tmp_count,
t->dtype, t->op, t->root_low_rank, t->low_comm, t->dtype, t->op, t->root_low_rank, t->low_comm,
t->low_comm->c_coll->coll_reduce_module); t->low_comm->c_coll->coll_reduce_module);
} }
if (!t->noop && ireduce_req) { if (!t->noop && ireduce_req) {
ompi_request_wait(&ireduce_req, MPI_STATUSES_IGNORE); ompi_request_wait(&ireduce_req, MPI_STATUS_IGNORE);
} }
return OMPI_SUCCESS; return OMPI_SUCCESS;
@ -217,13 +261,13 @@ int mca_coll_han_reduce_t1_task(void *task_argu) {
* a fallback is made on the next component that provides a reduce in priority order */ * a fallback is made on the next component that provides a reduce in priority order */
int int
mca_coll_han_reduce_intra_simple(const void *sbuf, mca_coll_han_reduce_intra_simple(const void *sbuf,
void* rbuf, void* rbuf,
int count, int count,
struct ompi_datatype_t *dtype, struct ompi_datatype_t *dtype,
ompi_op_t *op, ompi_op_t *op,
int root, int root,
struct ompi_communicator_t *comm, struct ompi_communicator_t *comm,
mca_coll_base_module_t *module) mca_coll_base_module_t *module)
{ {
int w_rank; /* information about the global communicator */ int w_rank; /* information about the global communicator */
int root_low_rank, root_up_rank; /* root ranks for both sub-communicators */ int root_low_rank, root_up_rank; /* root ranks for both sub-communicators */
@ -234,23 +278,37 @@ mca_coll_han_reduce_intra_simple(const void *sbuf,
mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;
/* Do not initialize topology if the operation cannot commute */ /* No support for non-commutative operations */
if(!ompi_op_is_commute(op)){ if(!ompi_op_is_commute(op)){
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle reduce with this operation. It needs to fall back on another component\n")); "han cannot handle reduce with this operation. Fall back on another component\n"));
goto prev_reduce_intra_simple; goto prev_reduce_intra;
}
/* Create the subcommunicators */
if( OMPI_SUCCESS != mca_coll_han_comm_create(comm, han_module) ) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle reduce with this communicator. Drop HAN support in this communicator and fall back on another component\n"));
/* HAN cannot work with this communicator so fallback on all collectives */
HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
return comm->c_coll->coll_reduce(sbuf, rbuf, count, dtype, op, root,
comm, comm->c_coll->coll_reduce_module);
} }
/* Topo must be initialized to know rank distribution which then is used to /* Topo must be initialized to know rank distribution which then is used to
* determine if han can be used */ * determine if han can be used */
mca_coll_han_topo_init(comm, han_module, 2); mca_coll_han_topo_init(comm, han_module, 2);
if (han_module->are_ppn_imbalanced){ if (han_module->are_ppn_imbalanced) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle reduce with this communicator. It needs to fall back on another component\n")); "han cannot handle reduce with this communicator (imbalanced). Drop HAN support in this communicator and fall back on another component\n"));
goto prev_reduce_intra_simple; /* Put back the fallback collective support and call it once. All
* future calls will then be automatically redirected.
*/
HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, reduce);
return comm->c_coll->coll_reduce(sbuf, rbuf, count, dtype, op, root,
comm, comm->c_coll->coll_reduce_module);
} }
mca_coll_han_comm_create(comm, han_module);
ompi_communicator_t *low_comm = ompi_communicator_t *low_comm =
han_module->cached_low_comms[mca_coll_han_component.han_reduce_low_module]; han_module->cached_low_comms[mca_coll_han_component.han_reduce_low_module];
ompi_communicator_t *up_comm = ompi_communicator_t *up_comm =
@ -289,7 +347,7 @@ mca_coll_han_reduce_intra_simple(const void *sbuf,
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"HAN/REDUCE: low comm reduce failed. " "HAN/REDUCE: low comm reduce failed. "
"Falling back to another component\n")); "Falling back to another component\n"));
goto prev_reduce_intra_simple; goto prev_reduce_intra;
} }
/* Up_comm reduce */ /* Up_comm reduce */
@ -315,10 +373,9 @@ mca_coll_han_reduce_intra_simple(const void *sbuf,
} }
return OMPI_SUCCESS; return OMPI_SUCCESS;
prev_reduce_intra_simple: prev_reduce_intra:
return han_module->previous_reduce(sbuf, rbuf, count, dtype, op, root, return han_module->previous_reduce(sbuf, rbuf, count, dtype, op, root,
comm, comm, han_module->previous_reduce_module);
han_module->previous_reduce_module);
} }
@ -341,15 +398,14 @@ mca_coll_han_reduce_reproducible_decision(struct ompi_communicator_t *comm,
int i; int i;
for (i=0; i<fallbacks_len; i++) { for (i=0; i<fallbacks_len; i++) {
int fallback = fallbacks[i]; int fallback = fallbacks[i];
mca_coll_base_module_t *fallback_module = han_module->modules_storage mca_coll_base_module_t *fallback_module
.modules[fallback] = han_module->modules_storage.modules[fallback].module_handler;
.module_handler;
if (fallback_module != NULL && fallback_module->coll_reduce != NULL) { if (fallback_module != NULL && fallback_module->coll_reduce != NULL) {
if (0 == w_rank) { if (0 == w_rank) {
opal_output_verbose(30, mca_coll_han_component.han_output, opal_output_verbose(30, mca_coll_han_component.han_output,
"coll:han:reduce_reproducible: " "coll:han:reduce_reproducible: "
"fallback on %s\n", "fallback on %s\n",
components_name[fallback]); available_components[fallback].component_name);
} }
han_module->reproducible_reduce_module = fallback_module; han_module->reproducible_reduce_module = fallback_module;
han_module->reproducible_reduce = fallback_module->coll_reduce; han_module->reproducible_reduce = fallback_module->coll_reduce;

Просмотреть файл

@ -15,96 +15,105 @@
#include "ompi/mca/pml/pml.h" #include "ompi/mca/pml/pml.h"
#include "coll_han_trigger.h" #include "coll_han_trigger.h"
static int mca_coll_han_scatter_us_task(void *task_args);
static int mca_coll_han_scatter_ls_task(void *task_args);
/* Only work with regular situation (each node has equal number of processes) */ /* Only work with regular situation (each node has equal number of processes) */
void mac_coll_han_set_scatter_argu(mca_scatter_argu_t * argu, static inline void
mca_coll_task_t * cur_task, mca_coll_han_set_scatter_args(mca_coll_han_scatter_args_t * args,
void *sbuf, mca_coll_task_t * cur_task,
void *sbuf_inter_free, void *sbuf,
void *sbuf_reorder_free, void *sbuf_inter_free,
int scount, void *sbuf_reorder_free,
struct ompi_datatype_t *sdtype, int scount,
void *rbuf, struct ompi_datatype_t *sdtype,
int rcount, void *rbuf,
struct ompi_datatype_t *rdtype, int rcount,
int root, struct ompi_datatype_t *rdtype,
int root_up_rank, int root,
int root_low_rank, int root_up_rank,
struct ompi_communicator_t *up_comm, int root_low_rank,
struct ompi_communicator_t *low_comm, struct ompi_communicator_t *up_comm,
int w_rank, bool noop, ompi_request_t * req) struct ompi_communicator_t *low_comm,
int w_rank, bool noop, ompi_request_t * req)
{ {
argu->cur_task = cur_task; args->cur_task = cur_task;
argu->sbuf = sbuf; args->sbuf = sbuf;
argu->sbuf_inter_free = sbuf_inter_free; args->sbuf_inter_free = sbuf_inter_free;
argu->sbuf_reorder_free = sbuf_reorder_free; args->sbuf_reorder_free = sbuf_reorder_free;
argu->scount = scount; args->scount = scount;
argu->sdtype = sdtype; args->sdtype = sdtype;
argu->rbuf = rbuf; args->rbuf = rbuf;
argu->rcount = rcount; args->rcount = rcount;
argu->rdtype = rdtype; args->rdtype = rdtype;
argu->root = root; args->root = root;
argu->root_up_rank = root_up_rank; args->root_up_rank = root_up_rank;
argu->root_low_rank = root_low_rank; args->root_low_rank = root_low_rank;
argu->up_comm = up_comm; args->up_comm = up_comm;
argu->low_comm = low_comm; args->low_comm = low_comm;
argu->w_rank = w_rank; args->w_rank = w_rank;
argu->noop = noop; args->noop = noop;
argu->req = req; args->req = req;
} }
int int
mca_coll_han_scatter_intra(const void *sbuf, int scount, mca_coll_han_scatter_intra(const void *sbuf, int scount,
struct ompi_datatype_t *sdtype, struct ompi_datatype_t *sdtype,
void *rbuf, int rcount, void *rbuf, int rcount,
struct ompi_datatype_t *rdtype, struct ompi_datatype_t *rdtype,
int root, int root,
struct ompi_communicator_t *comm, mca_coll_base_module_t * module) struct ompi_communicator_t *comm, mca_coll_base_module_t * module)
{ {
int i, j; mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module;
int w_rank, w_size; int i, j, w_rank, w_size;
w_rank = ompi_comm_rank(comm); w_rank = ompi_comm_rank(comm);
w_size = ompi_comm_size(comm); w_size = ompi_comm_size(comm);
mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; /* Create the subcommunicators */
int *topo = mca_coll_han_topo_init(comm, han_module, 2); if( OMPI_SUCCESS != mca_coll_han_comm_create(comm, han_module) ) { /* Let's hope the error is consistently returned across the entire communicator */
/* Topo must be initialized to know rank distribution which then is used to
* determine if han can be used */
mca_coll_han_topo_init(comm, han_module, 2);
if (han_module->are_ppn_imbalanced){
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle scatter with this communicator. It needs to fall back on another component\n")); "han cannot handle scatter with this communicator. Fall back on another component\n"));
goto prev_scatter_intra; /* HAN cannot work with this communicator so fallback on all collectives */
HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
return comm->c_coll->coll_scatter(sbuf, scount, sdtype, rbuf, rcount, rdtype, root,
comm, comm->c_coll->coll_scatter_module);
}
/* Topo must be initialized to know rank distribution which then is used to
* determine if han can be used */
int* topo = mca_coll_han_topo_init(comm, han_module, 2);
if (han_module->are_ppn_imbalanced) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle scatter with this communicator (imbalance). Fall back on another component\n"));
/* Put back the fallback collective support and call it once. All
* future calls will then be automatically redirected.
*/
HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, scatter);
return comm->c_coll->coll_scatter(sbuf, scount, sdtype, rbuf, rcount, rdtype, root,
comm, comm->c_coll->coll_scatter_module);
} }
/* Create the subcommunicators */
mca_coll_han_comm_create(comm, han_module);
ompi_communicator_t *low_comm = ompi_communicator_t *low_comm =
han_module->cached_low_comms[mca_coll_han_component.han_scatter_low_module]; han_module->cached_low_comms[mca_coll_han_component.han_scatter_low_module];
ompi_communicator_t *up_comm = ompi_communicator_t *up_comm =
han_module->cached_up_comms[mca_coll_han_component.han_scatter_up_module]; han_module->cached_up_comms[mca_coll_han_component.han_scatter_up_module];
int *vranks = han_module->cached_vranks; int *vranks = han_module->cached_vranks;
int low_rank = ompi_comm_rank(low_comm); int low_rank = ompi_comm_rank(low_comm);
int low_size = ompi_comm_size(low_comm); int low_size = ompi_comm_size(low_comm);
int up_size = ompi_comm_size(up_comm); int up_size = ompi_comm_size(up_comm);
ompi_request_t *temp_request = NULL;
/* Set up request */ /* Set up request */
temp_request = OBJ_NEW(ompi_request_t); ompi_request_t *temp_request = OBJ_NEW(ompi_request_t);
OMPI_REQUEST_INIT(temp_request, false);
temp_request->req_state = OMPI_REQUEST_ACTIVE; temp_request->req_state = OMPI_REQUEST_ACTIVE;
temp_request->req_type = 0; temp_request->req_type = OMPI_REQUEST_COLL;
temp_request->req_free = han_request_free; temp_request->req_free = han_request_free;
temp_request->req_status.MPI_SOURCE = 0; temp_request->req_status = (ompi_status_public_t){0};
temp_request->req_status.MPI_TAG = 0; temp_request->req_complete = REQUEST_PENDING;
temp_request->req_status.MPI_ERROR = 0;
temp_request->req_status._cancelled = 0;
temp_request->req_status._ucount = 0;
int root_low_rank; int root_low_rank;
int root_up_rank; int root_up_rank;
mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank);
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"[%d]: Han Scatter root %d root_low_rank %d root_up_rank %d\n", w_rank, "[%d]: Han Scatter root %d root_low_rank %d root_up_rank %d\n", w_rank,
@ -149,42 +158,55 @@ mca_coll_han_scatter_intra(const void *sbuf, int scount,
} }
} }
void *dest_buf = rbuf;
int dest_count = rcount;
ompi_datatype_t *dest_dtype = rdtype;
if (MPI_IN_PLACE == rbuf) {
dest_buf = (void*)sbuf;
dest_count = scount;
dest_dtype = sdtype;
}
/* Create us task */ /* Create us task */
mca_coll_task_t *us = OBJ_NEW(mca_coll_task_t); mca_coll_task_t *us = OBJ_NEW(mca_coll_task_t);
/* Setup us task arguments */ /* Setup us task arguments */
mca_scatter_argu_t *us_argu = malloc(sizeof(mca_scatter_argu_t)); mca_coll_han_scatter_args_t *us_args = malloc(sizeof(mca_coll_han_scatter_args_t));
mac_coll_han_set_scatter_argu(us_argu, us, reorder_sbuf, NULL, reorder_buf, scount, sdtype, mca_coll_han_set_scatter_args(us_args, us, reorder_sbuf, NULL, reorder_buf, scount, sdtype,
(char *) rbuf, rcount, rdtype, root, root_up_rank, root_low_rank, (char *) dest_buf, dest_count, dest_dtype, root, root_up_rank, root_low_rank,
up_comm, low_comm, w_rank, low_rank != root_low_rank, up_comm, low_comm, w_rank, low_rank != root_low_rank,
temp_request); temp_request);
/* Init us task */ /* Init us task */
init_task(us, mca_coll_han_scatter_us_task, (void *) (us_argu)); init_task(us, mca_coll_han_scatter_us_task, (void *) (us_args));
/* Issure us task */ /* Issure us task */
issue_task(us); issue_task(us);
ompi_request_wait(&temp_request, MPI_STATUS_IGNORE); ompi_request_wait(&temp_request, MPI_STATUS_IGNORE);
return OMPI_SUCCESS; return OMPI_SUCCESS;
prev_scatter_intra:
return han_module->previous_scatter(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm,
han_module->previous_scatter_module);
} }
/* us: upper level (intra-node) scatter task */ /* us: upper level (intra-node) scatter task */
int mca_coll_han_scatter_us_task(void *task_argu) int mca_coll_han_scatter_us_task(void *task_args)
{ {
mca_scatter_argu_t *t = (mca_scatter_argu_t *) task_argu; mca_coll_han_scatter_args_t *t = (mca_coll_han_scatter_args_t *) task_args;
OBJ_RELEASE(t->cur_task);
if (t->noop) { if (t->noop) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] Han Scatter: us noop\n", OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] Han Scatter: us noop\n",
t->w_rank)); t->w_rank));
} else { } else {
size_t count;
ompi_datatype_t *dtype;
if (t->w_rank == t->root) {
dtype = t->sdtype;
count = t->scount;
} else {
dtype = t->rdtype;
count = t->rcount;
}
int low_size = ompi_comm_size(t->low_comm); int low_size = ompi_comm_size(t->low_comm);
ptrdiff_t rsize, rgap = 0; ptrdiff_t rsize, rgap = 0;
rsize = opal_datatype_span(&t->rdtype->super, (int64_t) t->rcount * low_size, &rgap); rsize = opal_datatype_span(&dtype->super, (int64_t) count * low_size, &rgap);
char *tmp_buf = (char *) malloc(rsize); char *tmp_buf = (char *) malloc(rsize);
char *tmp_rbuf = tmp_buf - rgap; char *tmp_rbuf = tmp_buf - rgap;
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
@ -202,9 +224,7 @@ int mca_coll_han_scatter_us_task(void *task_argu)
t->sbuf_reorder_free = NULL; t->sbuf_reorder_free = NULL;
} }
/* Create ls tasks for the current union segment */ /* Create ls tasks for the current union segment */
mca_coll_task_t *ls = OBJ_NEW(mca_coll_task_t); mca_coll_task_t *ls = t->cur_task;
/* Setup up ls task arguments */
t->cur_task = ls;
/* Init ls task */ /* Init ls task */
init_task(ls, mca_coll_han_scatter_ls_task, (void *) t); init_task(ls, mca_coll_han_scatter_ls_task, (void *) t);
/* Issure ls task */ /* Issure ls task */
@ -213,14 +233,14 @@ int mca_coll_han_scatter_us_task(void *task_argu)
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
/* ls: lower level (shared memory) scatter task */ /* ls: lower level (shared memory or intra-node) scatter task */
int mca_coll_han_scatter_ls_task(void *task_argu) int mca_coll_han_scatter_ls_task(void *task_args)
{ {
mca_scatter_argu_t *t = (mca_scatter_argu_t *) task_argu; mca_coll_han_scatter_args_t *t = (mca_coll_han_scatter_args_t *) task_args;
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] Han Scatter: ls\n", OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] Han Scatter: ls\n",
t->w_rank)); t->w_rank));
OBJ_RELEASE(t->cur_task); OBJ_RELEASE(t->cur_task);
/* Shared memory scatter */
t->low_comm->c_coll->coll_scatter((char *) t->sbuf, t->scount, t->sdtype, (char *) t->rbuf, t->low_comm->c_coll->coll_scatter((char *) t->sbuf, t->scount, t->sdtype, (char *) t->rbuf,
t->rcount, t->rdtype, t->root_low_rank, t->low_comm, t->rcount, t->rdtype, t->root_low_rank, t->low_comm,
t->low_comm->c_coll->coll_scatter_module); t->low_comm->c_coll->coll_scatter_module);

Просмотреть файл

@ -26,157 +26,100 @@
#include "coll_han.h" #include "coll_han.h"
#include "coll_han_dynamic.h" #include "coll_han_dynamic.h"
#define HAN_SUBCOM_SAVE_COLLECTIVE(FALLBACKS, COMM, HANM, COLL) \
do { \
(FALLBACKS).COLL.COLL = (COMM)->c_coll->coll_ ## COLL; \
(FALLBACKS).COLL.module = (COMM)->c_coll->coll_ ## COLL ## _module; \
(COMM)->c_coll->coll_ ## COLL = (HANM)->fallback.COLL.COLL; \
(COMM)->c_coll->coll_ ## COLL ## _module = (HANM)->fallback.COLL.module; \
} while(0)
/* #define HAN_SUBCOM_LOAD_COLLECTIVE(FALLBACKS, COMM, HANM, COLL) \
* Local functions do { \
*/ (COMM)->c_coll->coll_ ## COLL = (FALLBACKS).COLL.COLL; \
static void create_intranode_comm_new(ompi_communicator_t *, (COMM)->c_coll->coll_ ## COLL ## _module = (FALLBACKS).COLL.module; \
ompi_communicator_t **); } while(0)
static void create_internode_comm_new(ompi_communicator_t *,
int, int,
ompi_communicator_t **);
static void create_intranode_comm(ompi_communicator_t *,
const char *,
int,
ompi_communicator_t **);
static void create_internode_comm(ompi_communicator_t *,
const char *,
int, int,
ompi_communicator_t **);
/**
* Create a sub-communicator containing the ranks that share my node.
*
* @param comm (IN) original communicator for the collective
* target module priority
* @param sub_comm (OUT) created sub-communicator
*/
static void create_intranode_comm_new(ompi_communicator_t *comm,
ompi_communicator_t **sub_comm)
{
ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0,
(opal_info_t *)(&ompi_mpi_info_null), sub_comm);
return;
}
/**
* Create a sub-communicator containing one rank per node.
*
* @param comm (IN) original communicator for the collective
* @param my_rank (IN) my rank in comm
* @param intra_rank (IN) local rank in the intra-node sub-communicator
* @param sub_comm (OUT) created sub-communicator
*/
static void create_internode_comm_new(ompi_communicator_t *comm,
int my_rank,
int intra_rank,
ompi_communicator_t **sub_comm)
{
ompi_comm_split(comm, intra_rank, my_rank, sub_comm, false);
return;
}
/* /*
* Routine that creates the local hierarchical sub-communicators * Routine that creates the local hierarchical sub-communicators
* Called each time a collective is called. * Called each time a collective is called.
* comm: input communicator of the collective * comm: input communicator of the collective
*/ */
void mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, int mca_coll_han_comm_create_new(struct ompi_communicator_t *comm,
mca_coll_han_module_t *han_module) mca_coll_han_module_t *han_module)
{ {
int low_rank, low_size; int low_rank, low_size, up_rank, w_rank, w_size;
int up_rank;
int w_rank;
int w_size;
ompi_communicator_t **low_comm = &(han_module->sub_comm[INTRA_NODE]); ompi_communicator_t **low_comm = &(han_module->sub_comm[INTRA_NODE]);
ompi_communicator_t **up_comm = &(han_module->sub_comm[INTER_NODE]); ompi_communicator_t **up_comm = &(han_module->sub_comm[INTER_NODE]);
const int *origin_priority; mca_coll_han_collectives_fallback_t fallbacks;
int han_var_id;
int tmp_han_priority;
int vrank, *vranks; int vrank, *vranks;
opal_info_t comm_info;
mca_coll_base_module_allreduce_fn_t old_allreduce;
mca_coll_base_module_t *old_allreduce_module;
mca_coll_base_module_allgather_fn_t old_allgather;
mca_coll_base_module_t *old_allgather_module;
mca_coll_base_module_bcast_fn_t old_bcast;
mca_coll_base_module_t *old_bcast_module;
mca_coll_base_module_gather_fn_t old_gather;
mca_coll_base_module_t *old_gather_module;
mca_coll_base_module_reduce_fn_t old_reduce;
mca_coll_base_module_t *old_reduce_module;
/* The sub communicators have already been created */ /* The sub communicators have already been created */
if (NULL != han_module->sub_comm[INTRA_NODE] if (han_module->enabled && NULL != han_module->sub_comm[INTRA_NODE]
&& NULL != han_module->sub_comm[INTER_NODE] && NULL != han_module->sub_comm[INTER_NODE]
&& NULL != han_module->cached_vranks) { && NULL != han_module->cached_vranks) {
return; return OMPI_SUCCESS;
} }
/* /*
* We cannot use han allreduce and allgather without sub-communicators * We cannot use han allreduce and allgather without sub-communicators,
* Temporary set previous ones * but we are in the creation of the data structures for the HAN, and
* temporarily need to save back the old collective.
* *
* Allgather is used to compute vranks * Allgather is used to compute vranks
* Allreduce is used by ompi_comm_split_type in create_intranode_comm_new * Allreduce is used by ompi_comm_split_type in create_intranode_comm_new
* Reduce + Bcast may be called by the allreduce implementation * Reduce + Bcast may be called by the allreduce implementation
* Gather + Bcast may be called by the allgather implementation * Gather + Bcast may be called by the allgather implementation
*/ */
old_allreduce = comm->c_coll->coll_allreduce; HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allgatherv);
old_allreduce_module = comm->c_coll->coll_allreduce_module; HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allgather);
HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allreduce);
HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, bcast);
HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, reduce);
HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, gather);
HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, scatter);
old_allgather = comm->c_coll->coll_allgather; /**
old_allgather_module = comm->c_coll->coll_allgather_module; * HAN is not yet optimized for a single process per node case, we should
* avoid selecting it for collective communication support in such cases.
* However, in order to decide if this is tru, we need to know how many
* local processes are on each node, a condition that cannot be verified
* outside the MPI support (with PRRTE the info will be eventually available,
* but we don't want to delay anything until then). We can achieve the same
* goal by using a reduction over the maximum number of peers per node among
* all participants.
*/
int local_procs = ompi_group_count_local_peers(comm->c_local_group);
comm->c_coll->coll_allreduce(MPI_IN_PLACE, &local_procs, 1, MPI_INT,
MPI_MAX, comm,
comm->c_coll->coll_allreduce_module);
if( local_procs == 1 ) {
/* restore saved collectives */
HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgatherv);
HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgather);
HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allreduce);
HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, bcast);
HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, reduce);
HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, gather);
HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, scatter);
han_module->enabled = false; /* entire module set to pass-through from now on */
return OMPI_ERR_NOT_SUPPORTED;
}
old_reduce = comm->c_coll->coll_reduce; OBJ_CONSTRUCT(&comm_info, opal_info_t);
old_reduce_module = comm->c_coll->coll_reduce_module;
old_bcast = comm->c_coll->coll_bcast;
old_bcast_module = comm->c_coll->coll_bcast_module;
old_gather = comm->c_coll->coll_gather;
old_gather_module = comm->c_coll->coll_gather_module;
comm->c_coll->coll_allreduce = han_module->previous_allreduce;
comm->c_coll->coll_allreduce_module = han_module->previous_allreduce_module;
comm->c_coll->coll_allgather = han_module->previous_allgather;
comm->c_coll->coll_allgather_module = han_module->previous_allgather_module;
comm->c_coll->coll_reduce = han_module->previous_reduce;
comm->c_coll->coll_reduce_module = han_module->previous_reduce_module;
comm->c_coll->coll_bcast = han_module->previous_bcast;
comm->c_coll->coll_bcast_module = han_module->previous_bcast_module;
comm->c_coll->coll_gather = han_module->previous_gather;
comm->c_coll->coll_gather_module = han_module->previous_gather_module;
/* Create topological sub-communicators */ /* Create topological sub-communicators */
w_rank = ompi_comm_rank(comm); w_rank = ompi_comm_rank(comm);
w_size = ompi_comm_size(comm); w_size = ompi_comm_size(comm);
origin_priority = NULL;
mca_base_var_find_by_name("coll_han_priority", &han_var_id);
mca_base_var_get_value(han_var_id, &origin_priority, NULL, NULL);
/*
* Maximum priority for selector on sub-communicators
*/
tmp_han_priority = 100;
mca_base_var_set_flag(han_var_id, MCA_BASE_VAR_FLAG_SETTABLE, true);
mca_base_var_set_value(han_var_id, &tmp_han_priority, sizeof(int),
MCA_BASE_VAR_SOURCE_SET, NULL);
/* /*
* This sub-communicator contains the ranks that share my node. * This sub-communicator contains the ranks that share my node.
*/ */
mca_coll_han_component.topo_level = INTRA_NODE; opal_info_set(&comm_info, "ompi_comm_coll_preference", "han");
create_intranode_comm_new(comm, low_comm); opal_info_set(&comm_info, "ompi_comm_coll_han_topo_level", "INTRA_NODE");
ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0,
&comm_info, low_comm);
/* /*
* Get my local rank and the local size * Get my local rank and the local size
@ -188,8 +131,8 @@ void mca_coll_han_comm_create_new(struct ompi_communicator_t *comm,
* This sub-communicator contains one process per node: processes with the * This sub-communicator contains one process per node: processes with the
* same intra-node rank id share such a sub-communicator * same intra-node rank id share such a sub-communicator
*/ */
mca_coll_han_component.topo_level = INTER_NODE; opal_info_set(&comm_info, "ompi_comm_coll_han_topo_level", "INTER_NODE");
create_internode_comm_new(comm, w_rank, low_rank, up_comm); ompi_comm_split_with_info(comm, low_rank, w_rank, &comm_info, up_comm, false);
up_rank = ompi_comm_rank(*up_comm); up_rank = ompi_comm_rank(*up_comm);
@ -208,216 +151,116 @@ void mca_coll_han_comm_create_new(struct ompi_communicator_t *comm,
* vrank * vrank
*/ */
comm->c_coll->coll_allgather(&vrank, comm->c_coll->coll_allgather(&vrank,
1, 1,
MPI_INT, MPI_INT,
vranks, vranks,
1, 1,
MPI_INT, MPI_INT,
comm, comm,
comm->c_coll->coll_allgather_module); comm->c_coll->coll_allgather_module);
/* /*
* Set the cached info * Set the cached info
*/ */
han_module->cached_vranks = vranks; han_module->cached_vranks = vranks;
/* /* Reset the saved collectives to point back to HAN */
* Come back to the original han module priority HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgatherv);
*/ HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgather);
mca_base_var_set_value(han_var_id, origin_priority, sizeof(int), HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allreduce);
MCA_BASE_VAR_SOURCE_SET, NULL); HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, bcast);
HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, reduce);
HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, gather);
HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, scatter);
/* Put allreduce, allgather, reduce and bcast back */ OBJ_DESTRUCT(&comm_info);
comm->c_coll->coll_allreduce = old_allreduce; return OMPI_SUCCESS;
comm->c_coll->coll_allreduce_module = old_allreduce_module;
comm->c_coll->coll_allgather = old_allgather;
comm->c_coll->coll_allgather_module = old_allgather_module;
comm->c_coll->coll_reduce = old_reduce;
comm->c_coll->coll_reduce_module = old_reduce_module;
comm->c_coll->coll_bcast = old_bcast;
comm->c_coll->coll_bcast_module = old_bcast_module;
comm->c_coll->coll_gather = old_gather;
comm->c_coll->coll_gather_module = old_gather_module;
mca_coll_han_component.topo_level = GLOBAL_COMMUNICATOR;
} }
/**
* Create a sub-communicator containing the ranks that share my node.
* Associate this sub-communicator a given collective module.
* module can be one of:
* . sm
* . shared
*
* @param comm (IN) original communicator for the collective
* @param prio_string (IN) string containing the mca variable associated to
* target module priority
* @param my_rank (IN) my rank in comm
* @param sub_comm (OUT) created sub-communicator
*/
static void create_intranode_comm(ompi_communicator_t *comm,
const char *prio_string,
int my_rank,
ompi_communicator_t **sub_comm)
{
int var_id;
const int *sav_priority;
int tmp_priority = 100;
/*
* Upgrade the target module priority to make the resulting sub-communicator
* use that collective module
*/
mca_base_var_find_by_name(prio_string, &var_id);
mca_base_var_get_value(var_id, &sav_priority, NULL, NULL);
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"[%d] %s origin %d\n",
my_rank, prio_string, *sav_priority));
mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true);
mca_base_var_set_value(var_id, &tmp_priority, sizeof(int),
MCA_BASE_VAR_SOURCE_SET, NULL);
/*
* Create the sub-communicator
* Since the target priority has been set to the highest value, this
* sub-communicator will inherit it as a collective module.
*/
ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0,
(opal_info_t *)(&ompi_mpi_info_null), sub_comm);
/*
* Come back to the target module's original priority
*/
mca_base_var_set_value(var_id, sav_priority, sizeof(int),
MCA_BASE_VAR_SOURCE_SET, NULL);
return;
}
/**
* Create a sub-communicator containing one rank per node.
* Associate this sub-communicator a given collective module.
* module can be one of:
* . libnbc
* . adapt
*
* @param comm (IN) original communicator for the collective
* @param prio_string (IN) string containing the mca variable associated to
* target module priority
* @param my_rank (IN) my rank in comm
* @param intra_rank (IN) local rank in the intra-node sub-communicator
* @param sub_comm (OUT) created sub-communicator
*/
static void create_internode_comm(ompi_communicator_t *comm,
const char *prio_string,
int my_rank,
int intra_rank,
ompi_communicator_t **sub_comm)
{
int var_id;
const int *sav_priority;
int tmp_priority = 100;
/*
* Upgrade the target module priority to make the resulting sub-communicator
* use that collective module
*/
mca_base_var_find_by_name(prio_string, &var_id);
mca_base_var_get_value(var_id, &sav_priority, NULL, NULL);
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"[%d] %s origin %d\n", my_rank, prio_string,
*sav_priority));
mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true);
mca_base_var_set_value(var_id, &tmp_priority, sizeof(int),
MCA_BASE_VAR_SOURCE_SET, NULL);
/*
* Create the sub-communicator
* Since the target priority has been set to the highest value, this
* sub-communicator will inherit it as a collective module.
*/
ompi_comm_split(comm, intra_rank, my_rank, sub_comm, false);
mca_base_var_set_value(var_id, sav_priority, sizeof(int),
MCA_BASE_VAR_SOURCE_SET, NULL);
return;
}
/* /*
* Routine that creates the local hierarchical sub-communicators * Routine that creates the local hierarchical sub-communicators
* Called each time a collective is called. * Called each time a collective is called.
* comm: input communicator of the collective * comm: input communicator of the collective
*/ */
void mca_coll_han_comm_create(struct ompi_communicator_t *comm, int mca_coll_han_comm_create(struct ompi_communicator_t *comm,
mca_coll_han_module_t *han_module) mca_coll_han_module_t *han_module)
{ {
int low_rank, low_size; int low_rank, low_size, up_rank, w_rank, w_size;
int up_rank; mca_coll_han_collectives_fallback_t fallbacks;
int w_rank;
int w_size;
ompi_communicator_t **low_comms; ompi_communicator_t **low_comms;
ompi_communicator_t **up_comms; ompi_communicator_t **up_comms;
const int *origin_priority;
int han_var_id;
int tmp_han_priority;
int vrank, *vranks; int vrank, *vranks;
opal_info_t comm_info;
mca_coll_base_module_allreduce_fn_t old_allreduce;
mca_coll_base_module_t *old_allreduce_module;
mca_coll_base_module_allgather_fn_t old_allgather;
mca_coll_base_module_t *old_allgather_module;
/* use cached communicators if possible */ /* use cached communicators if possible */
if (han_module->cached_comm == comm && if (han_module->enabled && han_module->cached_low_comms != NULL &&
han_module->cached_low_comms != NULL && han_module->cached_up_comms != NULL &&
han_module->cached_up_comms != NULL && han_module->cached_vranks != NULL) {
han_module->cached_vranks != NULL) { return OMPI_SUCCESS;
return;
} }
/* We cannot use han allreduce and allgather without sub-communicators /*
* Temporary set previous ones */ * We cannot use han allreduce and allgather without sub-communicators,
old_allreduce = comm->c_coll->coll_allreduce; * but we are in the creation of the data structures for the HAN, and
old_allreduce_module = comm->c_coll->coll_allreduce_module; * temporarily need to save back the old collective.
*
* Allgather is used to compute vranks
* Allreduce is used by ompi_comm_split_type in create_intranode_comm_new
* Reduce + Bcast may be called by the allreduce implementation
* Gather + Bcast may be called by the allgather implementation
*/
HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allgatherv);
HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allgather);
HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allreduce);
HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, bcast);
HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, reduce);
HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, gather);
HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, scatter);
old_allgather = comm->c_coll->coll_allgather; /**
old_allgather_module = comm->c_coll->coll_allgather_module; * HAN is not yet optimized for a single process per node case, we should
* avoid selecting it for collective communication support in such cases.
comm->c_coll->coll_allreduce = han_module->previous_allreduce; * However, in order to decide if this is tru, we need to know how many
comm->c_coll->coll_allreduce_module = han_module->previous_allreduce_module; * local processes are on each node, a condition that cannot be verified
* outside the MPI support (with PRRTE the info will be eventually available,
comm->c_coll->coll_allgather = han_module->previous_allgather; * but we don't want to delay anything until then). We can achieve the same
comm->c_coll->coll_allgather_module = han_module->previous_allgather_module; * goal by using a reduction over the maximum number of peers per node among
* all participants.
*/
int local_procs = ompi_group_count_local_peers(comm->c_local_group);
comm->c_coll->coll_allreduce(MPI_IN_PLACE, &local_procs, 1, MPI_INT,
MPI_MAX, comm,
comm->c_coll->coll_allreduce_module);
if( local_procs == 1 ) {
/* restore saved collectives */
HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgatherv);
HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgather);
HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allreduce);
HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, bcast);
HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, reduce);
HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, gather);
HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, scatter);
han_module->enabled = false; /* entire module set to pass-through from now on */
return OMPI_ERR_NOT_SUPPORTED;
}
/* create communicators if there is no cached communicator */ /* create communicators if there is no cached communicator */
w_rank = ompi_comm_rank(comm); w_rank = ompi_comm_rank(comm);
w_size = ompi_comm_size(comm); w_size = ompi_comm_size(comm);
low_comms = (struct ompi_communicator_t **)malloc(COLL_HAN_LOW_MODULES * low_comms = (struct ompi_communicator_t **)malloc(COLL_HAN_LOW_MODULES *
sizeof(struct ompi_communicator_t *)); sizeof(struct ompi_communicator_t *));
up_comms = (struct ompi_communicator_t **)malloc(COLL_HAN_UP_MODULES * up_comms = (struct ompi_communicator_t **)malloc(COLL_HAN_UP_MODULES *
sizeof(struct ompi_communicator_t *)); sizeof(struct ompi_communicator_t *));
origin_priority = NULL;
mca_base_var_find_by_name("coll_han_priority", &han_var_id);
mca_base_var_get_value(han_var_id, &origin_priority, NULL, NULL);
/* OBJ_CONSTRUCT(&comm_info, opal_info_t);
* Lower down our current priority
*/
tmp_han_priority = 0;
mca_base_var_set_flag(han_var_id, MCA_BASE_VAR_FLAG_SETTABLE, true);
mca_base_var_set_value(han_var_id, &tmp_han_priority, sizeof(int),
MCA_BASE_VAR_SOURCE_SET, NULL);
/* /*
* Upgrade sm module priority to set up low_comms[0] with sm module * Upgrade sm module priority to set up low_comms[0] with sm module
* This sub-communicator contains the ranks that share my node. * This sub-communicator contains the ranks that share my node.
*/ */
create_intranode_comm(comm, "coll_sm_priority", w_rank, &(low_comms[0])); opal_info_set(&comm_info, "ompi_comm_coll_preference", "sm,^han");
ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0,
&comm_info, &(low_comms[0]));
/* /*
* Get my local rank and the local size * Get my local rank and the local size
@ -429,15 +272,17 @@ void mca_coll_han_comm_create(struct ompi_communicator_t *comm,
* Upgrade shared module priority to set up low_comms[1] with shared module * Upgrade shared module priority to set up low_comms[1] with shared module
* This sub-communicator contains the ranks that share my node. * This sub-communicator contains the ranks that share my node.
*/ */
create_intranode_comm(comm, "coll_shared_priority", w_rank, &(low_comms[1])); opal_info_set(&comm_info, "ompi_comm_coll_preference", "shared,^han");
ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0,
&comm_info, &(low_comms[1]));
/* /*
* Upgrade libnbc module priority to set up up_comms[0] with libnbc module * Upgrade libnbc module priority to set up up_comms[0] with libnbc module
* This sub-communicator contains one process per node: processes with the * This sub-communicator contains one process per node: processes with the
* same intra-node rank id share such a sub-communicator * same intra-node rank id share such a sub-communicator
*/ */
create_internode_comm(comm, "coll_libnbc_priority", w_rank, low_rank, opal_info_set(&comm_info, "ompi_comm_coll_preference", "libnbc,^han");
&(up_comms[0])); ompi_comm_split_with_info(comm, low_rank, w_rank, &comm_info, &(up_comms[0]), false);
up_rank = ompi_comm_rank(up_comms[0]); up_rank = ompi_comm_rank(up_comms[0]);
@ -445,8 +290,8 @@ void mca_coll_han_comm_create(struct ompi_communicator_t *comm,
* Upgrade adapt module priority to set up up_comms[0] with adapt module * Upgrade adapt module priority to set up up_comms[0] with adapt module
* This sub-communicator contains one process per node. * This sub-communicator contains one process per node.
*/ */
create_internode_comm(comm, "coll_adapt_priority", w_rank, low_rank, opal_info_set(&comm_info, "ompi_comm_coll_preference", "adapt,^han");
&(up_comms[1])); ompi_comm_split_with_info(comm, low_rank, w_rank, &comm_info, &(up_comms[1]), false);
/* /*
* Set my virtual rank number. * Set my virtual rank number.
@ -468,23 +313,21 @@ void mca_coll_han_comm_create(struct ompi_communicator_t *comm,
/* /*
* Set the cached info * Set the cached info
*/ */
han_module->cached_comm = comm;
han_module->cached_low_comms = low_comms; han_module->cached_low_comms = low_comms;
han_module->cached_up_comms = up_comms; han_module->cached_up_comms = up_comms;
han_module->cached_vranks = vranks; han_module->cached_vranks = vranks;
/* /* Reset the saved collectives to point back to HAN */
* Come back to the original han module priority HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgatherv);
*/ HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgather);
mca_base_var_set_value(han_var_id, origin_priority, sizeof(int), HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allreduce);
MCA_BASE_VAR_SOURCE_SET, NULL); HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, bcast);
HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, reduce);
HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, gather);
HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, scatter);
/* Put allreduce and allgather back */ OBJ_DESTRUCT(&comm_info);
comm->c_coll->coll_allreduce = old_allreduce; return OMPI_SUCCESS;
comm->c_coll->coll_allreduce_module = old_allreduce_module;
comm->c_coll->coll_allgather = old_allgather;
comm->c_coll->coll_allgather_module = old_allgather_module;
} }

Просмотреть файл

@ -35,244 +35,24 @@
#include "coll_han.h" #include "coll_han.h"
/* #if OPAL_ENABLE_DEBUG
* Local functions static void
*/ mca_coll_han_topo_print(int *topo,
struct ompi_communicator_t *comm,
static int mca_coll_han_hostname_to_number(char* hostname, int size); int num_topo_level)
static void mca_coll_han_topo_get(int *topo,
struct ompi_communicator_t* comm,
int num_topo_level);
static void mca_coll_han_topo_sort(int *topo, int start, int end,
int level, int num_topo_level);
static bool mca_coll_han_topo_is_mapbycore(int *topo,
struct ompi_communicator_t *comm,
int num_topo_level);
static void mca_coll_han_topo_print(int *topo,
struct ompi_communicator_t *comm,
int num_topo_level);
/*
* takes the number part of a host: hhh2031 -->2031
*/
static int mca_coll_han_hostname_to_number(char* hostname, int size)
{ {
int i, j; int rank = ompi_comm_rank(comm);
char *number_array = (char *)malloc(sizeof(char) * size);
int number = 0;
for (i = 0, j = 0; hostname[i] != '\0'; i++) {
if ('0' <= hostname[i] && '9' >= hostname[i]) {
number_array[j++] = hostname[i];
}
}
number_array[j] = '\0';
number = atoi(number_array);
free(number_array);
return number;
}
/*
* Set the virtual topo id. It is made of num_topo_level ints (2 today):
* . the integer part of the host id
* . the rank in the main communicator
* Gather the virtual topoid from each process so every process will know other
* processes virtual topids
*/
static void mca_coll_han_topo_get(int *topo,
struct ompi_communicator_t* comm,
int num_topo_level)
{
int *self_topo = (int *)malloc(sizeof(int) * num_topo_level);
char hostname[1024];
gethostname(hostname, 1024);
self_topo[0] = mca_coll_han_hostname_to_number(hostname, 1024);
self_topo[1] = ompi_comm_rank(comm);
ompi_coll_base_allgather_intra_bruck(self_topo, num_topo_level, MPI_INT,
topo, num_topo_level, MPI_INT, comm,
comm->c_coll->coll_allgather_module);
free(self_topo);
return;
}
/*
* Sort the topology array in order to have ranks sharing the same node
* contiguous in the topology array.
* Called from topo_init whenever the processes are not mapped by core.
* ex: 4 ranks executing on 2 nodes, mapped by node
* ranks 0 and 2 on hid0
* ranks 1 and 3 on hid1
* On entry the topo array looks like
* hid0 0 hid1 1 hid0 2 hid1 3
* After the sort:
* hid0 0 hid0 2 hid1 1 hid1 3
* This is to have the gather result in the right order
*
* @param topo (IN/OUT) topology description array (sorted in out)
* @param start (IN) where to begin the processing
* The index in topo will actually be:
* start * num_topo_level + level
* topo contains num_topo_level ids per rank.
* @param end (IN) where to stop the processing
* The index in topo will actually be:
* end * num_topo_level + level
* topo contains num_topo_level ids per rank.
* @param level (IN) level number we are currently processing
* @param num_topo_level (IN) number of topological levels
*
*/
static void mca_coll_han_topo_sort(int *topo, int start, int end,
int level, int num_topo_level)
{
int i, j;
int min, min_loc;
int last, new_start, new_end;
if (level > num_topo_level-1 || start >= end) {
return;
}
min = INT_MAX;
min_loc = -1;
for (i = start; i <= end; i++) {
int temp;
/* get the min value for current level and its location */
for (j = i; j <= end; j++) {
/* topo contains num_topo_level ids per rank. */
if (topo[j * num_topo_level + level] < min) {
min = topo[j*num_topo_level+level];
min_loc = j;
}
}
/*
* swap i and min_loc
* We have num_topo_level ids to swap
*/
for (j = 0; j < num_topo_level; j++) {
temp = topo[i * num_topo_level + j];
topo[i * num_topo_level + j] = topo[min_loc * num_topo_level + j];
topo[min_loc * num_topo_level + j] = temp;
}
min = INT_MAX;
min_loc = -1;
}
/* Process next level */
last = 0;
new_start = 0;
new_end = 0;
for (i = start; i <= end; i++) {
if (i == start) {
last = topo[i * num_topo_level + level];
new_start = start;
} else if (i == end) {
new_end = end;
mca_coll_han_topo_sort(topo, new_start, new_end, level + 1,
num_topo_level);
} else if (last != topo[i * num_topo_level + level]) {
new_end = i - 1;
mca_coll_han_topo_sort(topo, new_start, new_end, level + 1,
num_topo_level);
new_start = i;
last = topo[i * num_topo_level + level];
}
}
return;
}
/*
* Check whether the ranks in the communicator given as input are mapped by core
* Mapped by core: each node is first filled with as many ranks as needed before
* moving to the next one
* This is checked as follows:
* . 2 contiguous ranks should be either on the same node or on node ids in
* ascending order
* The topology is actually an array of ints:
* +----------+-------+----------+-------+------+----------+-------+-----+
* | host_id0 | rank0 | host_id1 | rank1 | .... | host_idX | rankX | ... |
* +----------+-------+----------+-------+------+----------+-------+-----+
*/
static bool mca_coll_han_topo_is_mapbycore(int *topo,
struct ompi_communicator_t *comm,
int num_topo_level)
{
int i;
int size = ompi_comm_size(comm); int size = ompi_comm_size(comm);
for (i = 1; i < size; i++) { if (rank == 0) {
/* OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: Han topo: ", rank));
* The host id for a given rank should be < host id for the next rank for( int i = 0; i < size*num_topo_level; i++ ) {
*/ OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "%d ", topo[i]));
if (topo[(i - 1) * num_topo_level] > topo[i * num_topo_level]) {
return false;
}
/*
* For the same host id, consecutive ranks should be sorted in
* ascending order.
*/
if (topo[(i - 1) * num_topo_level + 1] > topo[i * num_topo_level + 1]) {
return false;
} }
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "\n"));
} }
return true;
} }
#endif /* OPAL_ENABLE_DEBUG */
/*
 * The topo is supposed sorted by host. Detect whether the number of
 * processes per node (ppn) differs between nodes: returns true when the
 * distribution is imbalanced, false when every node hosts the same ppn.
 */
static bool mca_coll_han_topo_are_ppn_imbalanced(int *topo,
                                                 struct ompi_communicator_t *comm,
                                                 int num_topo_level){
    int i;
    int size = ompi_comm_size(comm);
    /* Fewer than 2 processes can never be imbalanced */
    if (size < 2){
        return false;
    }
    int ppn;
    int last_host = topo[0];
    /* Find the ppn for the first node */
    for (i = 1; i < size; i++) {
        if (topo[i * num_topo_level] != last_host){
            break;
        }
    }
    ppn = i;
    /* All on one node */
    if ( size == ppn){
        return false;
    }
    /* Trivial case: size is not a multiple of the first node's ppn */
    if (size % ppn != 0){
        return true;
    }
    last_host = topo[ppn * num_topo_level];
    /* Check that the 2nd and next hosts also have this ppn. Since the topo is sorted
     * one just needs to jump ppn ranks to check the supposed switch of host */
    for (i = 2 * ppn; i < size; i += ppn ){
        /* the list of ranks for the last known host has ended before expected */
        if (topo[(i-1) * num_topo_level] != last_host){
            return true;
        }
        /* the list of ranks for the last known host is bigger than expected */
        if (topo[(i-1) * num_topo_level] == topo[i*num_topo_level]){
            return true;
        }
        last_host = topo[i * num_topo_level];
    }
    /* Check the last host: its ranks must run exactly to the end */
    if (topo[(size-1) * num_topo_level] != last_host){
        return true;
    }
    return false;
}
/** /**
* Topology initialization phase * Topology initialization phase
@ -280,68 +60,136 @@ static bool mca_coll_han_topo_are_ppn_imbalanced(int *topo,
* *
* @param num_topo_level (IN) Number of the topological levels * @param num_topo_level (IN) Number of the topological levels
*/ */
int *mca_coll_han_topo_init(struct ompi_communicator_t *comm, int*
mca_coll_han_module_t *han_module, mca_coll_han_topo_init(struct ompi_communicator_t *comm,
int num_topo_level) mca_coll_han_module_t *han_module,
int num_topo_level)
{ {
int size; if ( NULL != han_module->cached_topo ) {
int *topo; return han_module->cached_topo;
size = ompi_comm_size(comm);
if (!((han_module->cached_topo) && (han_module->cached_comm == comm))) {
if (han_module->cached_topo) {
free(han_module->cached_topo);
han_module->cached_topo = NULL;
}
topo = (int *)malloc(sizeof(int) * size * num_topo_level);
/* get topo infomation */
mca_coll_han_topo_get(topo, comm, num_topo_level);
mca_coll_han_topo_print(topo, comm, num_topo_level);
/*
* All the ranks now have the topo information
*/
/* check if the processes are mapped by core */
han_module->is_mapbycore = mca_coll_han_topo_is_mapbycore(topo, comm, num_topo_level);
/*
* If not, sort the topo such that each group of ids is sorted by rank
* i.e. ids for rank i are contiguous to ids for rank i+1.
* This will be needed for the operations that are order sensitive
* (like gather)
*/
if (!han_module->is_mapbycore) {
mca_coll_han_topo_sort(topo, 0, size-1, 0, num_topo_level);
}
han_module->are_ppn_imbalanced = mca_coll_han_topo_are_ppn_imbalanced(topo, comm , num_topo_level);
han_module->cached_topo = topo;
han_module->cached_comm = comm;
} else {
topo = han_module->cached_topo;
} }
ompi_communicator_t *up_comm, *low_comm;
ompi_request_t *request = MPI_REQUEST_NULL;
int *my_low_rank_map = NULL;
int *ranks_map = NULL;
int size = ompi_comm_size(comm);
if (NULL != han_module->cached_up_comms) {
up_comm = han_module->cached_up_comms[0];
low_comm = han_module->cached_low_comms[0];
} else {
up_comm = han_module->sub_comm[INTER_NODE];
low_comm = han_module->sub_comm[INTRA_NODE];
}
assert(up_comm != NULL && low_comm != NULL);
int low_rank = ompi_comm_rank(low_comm);
int low_size = ompi_comm_size(low_comm);
int *topo = (int *)malloc(sizeof(int) * size * num_topo_level);
int is_imbalanced = 1;
int ranks_consecutive = 1;
/* node leaders translate the node-local ranks to global ranks and check whether they are placed consecutively */
if (0 == low_rank) {
my_low_rank_map = malloc(sizeof(int)*low_size);
for (int i = 0; i < low_size; ++i) {
topo[i] = i;
}
ompi_group_translate_ranks(low_comm->c_local_group, low_size, topo,
comm->c_local_group, my_low_rank_map);
/* check if ranks are consecutive */
int rank = my_low_rank_map[0] + 1;
for (int i = 1; i < low_size; ++i, ++rank) {
if (my_low_rank_map[i] != rank) {
ranks_consecutive = 0;
break;
}
}
int reduce_vals[] = {ranks_consecutive, -ranks_consecutive, low_size, -low_size};
up_comm->c_coll->coll_allreduce(MPI_IN_PLACE, &reduce_vals, 4,
MPI_INT, MPI_MAX, up_comm,
up_comm->c_coll->coll_allreduce_module);
/* is the distribution of processes balanced per node? */
is_imbalanced = (reduce_vals[2] == -reduce_vals[3]) ? 0 : 1;
ranks_consecutive = (reduce_vals[0] == -reduce_vals[1]) ? 1 : 0;
if ( !ranks_consecutive && !is_imbalanced ) {
/* kick off up_comm allgather to collect non-consecutive rank information at node leaders */
ranks_map = malloc(sizeof(int)*size);
up_comm->c_coll->coll_iallgather(my_low_rank_map, low_size, MPI_INT,
ranks_map, low_size, MPI_INT, up_comm, &request,
up_comm->c_coll->coll_iallgather_module);
}
}
/* broadcast balanced and consecutive properties from node leaders to remaining ranks */
int bcast_vals[] = {is_imbalanced, ranks_consecutive};
low_comm->c_coll->coll_bcast(bcast_vals, 2, MPI_INT, 0,
low_comm, low_comm->c_coll->coll_bcast_module);
is_imbalanced = bcast_vals[0];
ranks_consecutive = bcast_vals[1];
/* error out if the rank distribution is not balanced */
if (is_imbalanced) {
assert(MPI_REQUEST_NULL == request);
han_module->are_ppn_imbalanced = true;
free(topo);
if( NULL != my_low_rank_map ) free(my_low_rank_map);
if( NULL != ranks_map ) free(ranks_map);
return NULL;
}
han_module->are_ppn_imbalanced = false;
if (ranks_consecutive) {
/* fast-path: all ranks are consecutive and balanced so fill topology locally */
for (int i = 0; i < size; ++i) {
topo[2*i] = (i/low_size); // node leader is node ID
topo[2*i+1] = i;
}
han_module->is_mapbycore = true;
} else {
/*
* Slow path: gather global-to-node-local rank mappings at node leaders
*
* The topology will contain a mapping from global consecutive positions
* to ranks in the communicator.
*
* ex: 4 ranks executing on 2 nodes, mapped by node
* ranks 0 and 2 on hid0
* ranks 1 and 3 on hid1
* On entry the topo array looks like
* hid0 0 hid1 1 hid0 2 hid1 3
* After the sort:
* hid0 0 hid0 2 hid1 1 hid1 3
*/
if (0 == low_rank) {
ompi_request_wait(&request, MPI_STATUS_IGNORE);
/* fill topology */
for (int i = 0; i < size; ++i) {
topo[2*i] = ranks_map[(i/low_size)*low_size]; // node leader is node ID
topo[2*i+1] = ranks_map[i];
}
free(ranks_map);
}
}
/* broadcast topology from node leaders to remaining ranks */
low_comm->c_coll->coll_bcast(topo, num_topo_level*size, MPI_INT, 0,
low_comm, low_comm->c_coll->coll_bcast_module);
free(my_low_rank_map);
han_module->cached_topo = topo;
#if OPAL_ENABLE_DEBUG
mca_coll_han_topo_print(topo, comm, num_topo_level); mca_coll_han_topo_print(topo, comm, num_topo_level);
#endif /* OPAL_ENABLE_DEBUG */
return topo; return topo;
} }
/* Dump the flattened topology array (host_id/rank pairs) from rank 0,
 * at verbosity level 30, for debugging purposes. */
static void mca_coll_han_topo_print(int *topo,
                                    struct ompi_communicator_t *comm,
                                    int num_topo_level)
{
    int me = ompi_comm_rank(comm);

    /* Only the root of the communicator prints */
    if (0 != me) {
        return;
    }
    int total = ompi_comm_size(comm) * num_topo_level;
    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: Han Scatter topo: ", me));
    for (int idx = 0; idx < total; idx++) {
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "%d ", topo[idx]));
    }
    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "\n"));
}

Просмотреть файл

@ -14,29 +14,14 @@
/* Reset the task callback and its argument on construction */
static void mca_coll_task_constructor(mca_coll_task_t * t)
{
    t->func_ptr = NULL;
    t->func_argu = NULL;
}

/* Clear the task callback and its argument on destruction */
static void mca_coll_task_destructor(mca_coll_task_t * t)
{
    t->func_ptr = NULL;
    t->func_argu = NULL;
}

OBJ_CLASS_INSTANCE(mca_coll_task_t, opal_object_t, mca_coll_task_constructor,
                   mca_coll_task_destructor);
/* Init task: record the callback and the argument it will be invoked with */
int init_task(mca_coll_task_t * t, task_func_ptr func_ptr, void *func_argu)
{
    t->func_argu = func_argu;
    t->func_ptr = func_ptr;
    return OMPI_SUCCESS;
}
/* Issue the task: invoke the stored callback on its stored argument.
 * The callback's return value is deliberately ignored. */
int issue_task(mca_coll_task_t * t)
{
    (void) t->func_ptr(t->func_argu);
    return OMPI_SUCCESS;
}

Просмотреть файл

@ -12,25 +12,17 @@
#ifndef MCA_COLL_HAN_TRIGGER_EXPORT_H
#define MCA_COLL_HAN_TRIGGER_EXPORT_H

#include "ompi_config.h"
#include "mpi.h"
#include "ompi/mca/mca.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/communicator/communicator.h"
#include "ompi/win/win.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "opal/util/info.h"
#include "ompi/op/op.h"
#include "opal/runtime/opal_progress.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/coll/base/coll_tags.h"

/* Signature of a deferred collective step: takes an opaque argument,
 * returns an OMPI status code */
typedef int (*task_func_ptr) (void *);

/* A schedulable unit of work in the HAN task-based collectives */
struct mca_coll_task_s {
    opal_object_t super;
    task_func_ptr func_ptr;   /* callback executing the step */
    void *func_argu;          /* opaque argument handed to the callback */
};
typedef struct mca_coll_task_s mca_coll_task_t;

OBJ_CLASS_DECLARATION(mca_coll_task_t);

/* Init task */
int init_task(mca_coll_task_t * t, task_func_ptr func_ptr, void *func_argu);

/* Issue the task */
int issue_task(mca_coll_task_t * t);

#endif /* MCA_COLL_HAN_TRIGGER_EXPORT_H */

Просмотреть файл

@ -1,58 +0,0 @@
/*
* Copyright (c) 2018-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "coll_han.h"
/* Get root's low_rank (rank within its node) and up_rank (node index)
 * from the vranks array */
void mca_coll_han_get_ranks(int *vranks, int root, int low_size, int *root_low_rank,
                            int *root_up_rank)
{
    const int vrank = vranks[root];
    *root_low_rank = vrank % low_size;
    *root_up_rank = vrank / low_size;
}
/*
 * Map a node count n to the index of the smallest supported node count
 * (4, 8, 16, 32, 64) that is >= n; clamps to the last index when n is larger
 * than every entry. Used to index the auto-tuned decision tables.
 */
uint32_t han_auto_tuned_get_n(uint32_t n)
{
    /* static const: built once, and the loop bound is derived from the
     * array itself instead of a duplicated magic number */
    static const uint32_t avail[] = { 4, 8, 16, 32, 64 };
    const uint32_t count = sizeof(avail) / sizeof(avail[0]);
    uint32_t i;
    for (i = 0; i < count; i++) {
        if (avail[i] >= n) {
            return i;
        }
    }
    return i - 1;   /* saturate at the largest supported bucket */
}
/*
 * Map a core count c to the index of the smallest supported core count
 * (4, 8, 12) that is >= c; clamps to the last index when c is larger than
 * every entry. Used to index the auto-tuned decision tables.
 */
uint32_t han_auto_tuned_get_c(uint32_t c)
{
    /* static const: built once, loop bound derived from the array itself */
    static const uint32_t avail[] = { 4, 8, 12 };
    const uint32_t count = sizeof(avail) / sizeof(avail[0]);
    uint32_t i;
    for (i = 0; i < count; i++) {
        if (avail[i] >= c) {
            return i;
        }
    }
    return i - 1;   /* saturate at the largest supported bucket */
}
/*
 * Map a message size m to the index of the smallest supported message size
 * (powers of two from 4 to 4194304) that is >= m; clamps to the last index
 * when m is larger than every entry. Used to index the auto-tuned decision
 * tables.
 */
uint32_t han_auto_tuned_get_m(uint32_t m)
{
    /* static const: built once, loop bound derived from the array itself */
    static const uint32_t avail[] =
        { 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072,
          262144, 524288, 1048576, 2097152, 4194304 };
    const uint32_t count = sizeof(avail) / sizeof(avail[0]);
    uint32_t i;
    for (i = 0; i < count; i++) {
        if (avail[i] >= m) {
            return i;
        }
    }
    return i - 1;   /* saturate at the largest supported bucket */
}

Просмотреть файл

@ -174,7 +174,7 @@ mca_coll_sm_comm_query(struct ompi_communicator_t *comm, int *priority)
if (OMPI_COMM_IS_INTER(comm) || 1 == ompi_comm_size(comm) || ompi_group_have_remote_peers (comm->c_local_group)) { if (OMPI_COMM_IS_INTER(comm) || 1 == ompi_comm_size(comm) || ompi_group_have_remote_peers (comm->c_local_group)) {
opal_output_verbose(10, ompi_coll_base_framework.framework_output, opal_output_verbose(10, ompi_coll_base_framework.framework_output,
"coll:sm:comm_query (%d/%s): intercomm, comm is too small, or not all peers local; disqualifying myself", comm->c_contextid, comm->c_name); "coll:sm:comm_query (%d/%s): intercomm, comm is too small, or not all peers local; disqualifying myself", comm->c_contextid, comm->c_name);
return NULL; return NULL;
} }
/* Get the priority level attached to this module. If priority is less /* Get the priority level attached to this module. If priority is less

Просмотреть файл

@ -1446,7 +1446,8 @@ int ompi_coll_tuned_gather_intra_dec_fixed(const void *sbuf, int scount,
communicator_size = ompi_comm_size(comm); communicator_size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm); rank = ompi_comm_rank(comm);
if (rank == root) { /* Determine block size */
if ( (rank == root) || (MPI_IN_PLACE == sbuf) ) {
ompi_datatype_type_size(rdtype, &dsize); ompi_datatype_type_size(rdtype, &dsize);
total_dsize = dsize * (ptrdiff_t)rcount; total_dsize = dsize * (ptrdiff_t)rcount;
} else { } else {

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2015 The University of Tennessee and The University * Copyright (c) 2004-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -40,7 +40,7 @@
static int fileline=0; /* used for verbose error messages */ static int fileline=0; /* used for verbose error messages */
#define getnext(fptr) ompi_coll_base_file_getnext(fptr, &fileline) #define getnext(fptr, pval) ompi_coll_base_file_getnext_long(fptr, &fileline, pval)
/* /*
* Reads a rule file called fname * Reads a rule file called fname
@ -56,9 +56,8 @@ static int fileline=0; /* used for verbose error messages */
int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** rules, int n_collectives) int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** rules, int n_collectives)
{ {
long CI, NCS, CS, ALG, NMS, FANINOUT, X, MS, SS;
FILE *fptr = (FILE*) NULL; FILE *fptr = (FILE*) NULL;
int X, CI, NCS, CS, ALG, NMS, FANINOUT;
long MS, SS;
int x, ncs, nms; int x, ncs, nms;
ompi_coll_alg_rule_t *alg_rules = (ompi_coll_alg_rule_t*) NULL; /* complete table of rules */ ompi_coll_alg_rule_t *alg_rules = (ompi_coll_alg_rule_t*) NULL; /* complete table of rules */
@ -101,45 +100,42 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t**
goto on_file_error; goto on_file_error;
} }
X = (int)getnext(fptr); if( (getnext(fptr, &X) < 0) || (X < 0) ) {
if (X<0) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of collectives in configuration file around line %d\n", fileline)); OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of collectives in configuration file around line %d\n", fileline));
goto on_file_error; goto on_file_error;
} }
if (X>n_collectives) { if (X>n_collectives) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"Number of collectives in configuration file %d is greater than number of MPI collectives possible %d ??? error around line %d\n", X, n_collectives, fileline)); OPAL_OUTPUT((ompi_coll_tuned_stream,"Number of collectives in configuration file %ld is greater than number of MPI collectives possible %d ??? error around line %d\n", X, n_collectives, fileline));
goto on_file_error; goto on_file_error;
} }
for (x=0;x<X;x++) { /* for each collective */ for (x=0;x<X;x++) { /* for each collective */
CI = (int)getnext (fptr); if( (getnext(fptr, &CI) < 0) || (CI < 0) ) {
if (CI<0) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read next Collective id in configuration file around line %d\n", fileline)); OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read next Collective id in configuration file around line %d\n", fileline));
goto on_file_error; goto on_file_error;
} }
if (CI>=n_collectives) { if (CI>=n_collectives) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"Collective id in configuration file %d is greater than MPI collectives possible %d. Error around line %d\n", CI, n_collectives, fileline)); OPAL_OUTPUT((ompi_coll_tuned_stream,"Collective id in configuration file %ld is greater than MPI collectives possible %d. Error around line %d\n", CI, n_collectives, fileline));
goto on_file_error; goto on_file_error;
} }
if (alg_rules[CI].alg_rule_id != CI) { if (alg_rules[CI].alg_rule_id != CI) {
OPAL_OUTPUT((ompi_coll_tuned_stream, "Internal error in handling collective ID %d\n", CI)); OPAL_OUTPUT((ompi_coll_tuned_stream, "Internal error in handling collective ID %ld\n", CI));
goto on_file_error; goto on_file_error;
} }
OPAL_OUTPUT((ompi_coll_tuned_stream, "Reading dynamic rule for collective ID %d\n", CI)); OPAL_OUTPUT((ompi_coll_tuned_stream, "Reading dynamic rule for collective ID %ld\n", CI));
alg_p = &alg_rules[CI]; alg_p = &alg_rules[CI];
alg_p->alg_rule_id = CI; alg_p->alg_rule_id = CI;
alg_p->n_com_sizes = 0; alg_p->n_com_sizes = 0;
alg_p->com_rules = (ompi_coll_com_rule_t *) NULL; alg_p->com_rules = (ompi_coll_com_rule_t *) NULL;
NCS = (int)getnext (fptr); if( (getnext (fptr, &NCS) < 0) || (NCS < 0) ) {
if (NCS<0) { OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read count of communicators for collective ID %ld at around line %d\n", CI, fileline));
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read count of communicators for collective ID %d at around line %d\n", CI, fileline));
goto on_file_error; goto on_file_error;
} }
OPAL_OUTPUT((ompi_coll_tuned_stream, "Read communicator count %d for dynamic rule for collective ID %d\n", NCS, CI)); OPAL_OUTPUT((ompi_coll_tuned_stream, "Read communicator count %ld for dynamic rule for collective ID %ld\n", NCS, CI));
alg_p->n_com_sizes = NCS; alg_p->n_com_sizes = NCS;
alg_p->com_rules = ompi_coll_tuned_mk_com_rules (NCS, CI); alg_p->com_rules = ompi_coll_tuned_mk_com_rules (NCS, CI);
if (NULL == alg_p->com_rules) { if (NULL == alg_p->com_rules) {
@ -151,20 +147,18 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t**
com_p = &(alg_p->com_rules[ncs]); com_p = &(alg_p->com_rules[ncs]);
CS = (int)getnext (fptr); if( (getnext (fptr, &CS) < 0) || (CS < 0) ) {
if (CS<0) { OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read communicator size for collective ID %ld com rule %d at around line %d\n", CI, ncs, fileline));
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read communicator size for collective ID %d com rule %d at around line %d\n", CI, ncs, fileline));
goto on_file_error; goto on_file_error;
} }
com_p->mpi_comsize = CS; com_p->mpi_comsize = CS;
NMS = (int)getnext (fptr); if( (getnext (fptr, &NMS) < 0) || (NMS < 0) ) {
if (NMS<0) { OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of message sizes for collective ID %ld com rule %d at around line %d\n", CI, ncs, fileline));
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of message sizes for collective ID %d com rule %d at around line %d\n", CI, ncs, fileline));
goto on_file_error; goto on_file_error;
} }
OPAL_OUTPUT((ompi_coll_tuned_stream, "Read message count %d for dynamic rule for collective ID %d and comm size %d\n", OPAL_OUTPUT((ompi_coll_tuned_stream, "Read message count %ld for dynamic rule for collective ID %ld and comm size %ld\n",
NMS, CI, CS)); NMS, CI, CS));
com_p->n_msg_sizes = NMS; com_p->n_msg_sizes = NMS;
com_p->msg_rules = ompi_coll_tuned_mk_msg_rules (NMS, CI, ncs, CS); com_p->msg_rules = ompi_coll_tuned_mk_msg_rules (NMS, CI, ncs, CS);
@ -179,37 +173,33 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t**
msg_p = &(com_p->msg_rules[nms]); msg_p = &(com_p->msg_rules[nms]);
MS = getnext (fptr); if( (getnext (fptr, &MS) < 0) || (MS < 0) ) {
if (MS<0) { OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read message size for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline));
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read message size for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline));
goto on_file_error; goto on_file_error;
} }
msg_p->msg_size = (size_t)MS; msg_p->msg_size = (size_t)MS;
ALG = (int)getnext (fptr); if( (getnext (fptr, &ALG) < 0) || (ALG < 0) ) {
if (ALG<0) { OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target algorithm method for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline));
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target algorithm method for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline));
goto on_file_error; goto on_file_error;
} }
msg_p->result_alg = ALG; msg_p->result_alg = ALG;
FANINOUT = (int)getnext (fptr); if( (getnext (fptr, &FANINOUT) < 0) || (FANINOUT < 0) ) {
if (FANINOUT<0) { OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read fan in/out topo for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline));
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read fan in/out topo for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline));
goto on_file_error; goto on_file_error;
} }
msg_p->result_topo_faninout = FANINOUT; msg_p->result_topo_faninout = FANINOUT;
SS = getnext (fptr); if( (getnext (fptr, &SS) < 0) || (SS < 0) ) {
if (SS<0) { OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target segment size for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline));
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target segment size for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline));
goto on_file_error; goto on_file_error;
} }
msg_p->result_segsize = SS; msg_p->result_segsize = SS;
if (!nms && MS) { if (!nms && MS) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"All algorithms must specify a rule for message size of zero upwards always first!\n")); OPAL_OUTPUT((ompi_coll_tuned_stream,"All algorithms must specify a rule for message size of zero upwards always first!\n"));
OPAL_OUTPUT((ompi_coll_tuned_stream,"Message size was %lu for collective ID %d com rule %d msg rule %d at around line %d\n", MS, CI, ncs, nms, fileline)); OPAL_OUTPUT((ompi_coll_tuned_stream,"Message size was %lu for collective ID %ld com rule %d msg rule %d at around line %d\n", MS, CI, ncs, nms, fileline));
goto on_file_error; goto on_file_error;
} }
@ -222,7 +212,7 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t**
} /* comm size */ } /* comm size */
total_alg_count++; total_alg_count++;
OPAL_OUTPUT((ompi_coll_tuned_stream, "Done reading dynamic rule for collective ID %d\n", CI)); OPAL_OUTPUT((ompi_coll_tuned_stream, "Done reading dynamic rule for collective ID %ld\n", CI));
} /* per collective */ } /* per collective */

Просмотреть файл

@ -54,7 +54,7 @@ static void ompi_request_construct(ompi_request_t* req)
/* don't call _INIT, we don't to set the request to _INACTIVE and there will /* don't call _INIT, we don't to set the request to _INACTIVE and there will
* be no matching _FINI invocation */ * be no matching _FINI invocation */
req->req_state = OMPI_REQUEST_INVALID; req->req_state = OMPI_REQUEST_INVALID;
req->req_complete = false; req->req_complete = REQUEST_COMPLETED;
req->req_persistent = false; req->req_persistent = false;
req->req_start = NULL; req->req_start = NULL;
req->req_free = NULL; req->req_free = NULL;