1
1

Merge pull request #7735 from bosilca/coll/han

A hierarchical, architecture-aware collective communication module
Этот коммит содержится в:
bosilca 2020-10-26 00:07:03 -04:00 коммит произвёл GitHub
родитель 6304c3f57c cc6432b4a2
Коммит ce97090673
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
33 изменённых файлов: 6925 добавлений и 118 удалений

Просмотреть файл

@ -401,11 +401,10 @@ int ompi_comm_create ( ompi_communicator_t *comm, ompi_group_t *group,
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/*
** Counterpart to MPI_Comm_split. To be used within OMPI (e.g. MPI_Cart_sub).
*/
int ompi_comm_split( ompi_communicator_t* comm, int color, int key,
ompi_communicator_t **newcomm, bool pass_on_topo )
int ompi_comm_split_with_info( ompi_communicator_t* comm, int color, int key,
opal_info_t *info,
ompi_communicator_t **newcomm, bool pass_on_topo )
{
int myinfo[2];
int size, my_size;
@ -611,7 +610,11 @@ int ompi_comm_split( ompi_communicator_t* comm, int color, int key,
snprintf(newcomp->c_name, MPI_MAX_OBJECT_NAME, "MPI COMMUNICATOR %d SPLIT FROM %d",
newcomp->c_contextid, comm->c_contextid );
/* Copy info if there is one */
if (info) {
newcomp->super.s_info = OBJ_NEW(opal_info_t);
opal_info_dup(info, &(newcomp->super.s_info));
}
/* Activate the communicator and init coll-component */
rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode);
@ -638,6 +641,15 @@ int ompi_comm_split( ompi_communicator_t* comm, int color, int key,
}
/*
** Counterpart to MPI_Comm_split. To be used within OMPI (e.g. MPI_Cart_sub).
** Thin wrapper over ompi_comm_split_with_info without an info object.
*/
int ompi_comm_split( ompi_communicator_t* comm, int color, int key,
                     ompi_communicator_t **newcomm, bool pass_on_topo )
{
    /* Defer to the extended variant, passing NULL so no info is copied
     * onto the resulting communicator. */
    int rc = ompi_comm_split_with_info(comm, color, key, /* info */ NULL,
                                       newcomm, pass_on_topo);
    return rc;
}
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/

Просмотреть файл

@ -463,6 +463,21 @@ int ompi_topo_dist_graph_create_adjacent(ompi_communicator_t *old_comm,
OMPI_DECLSPEC int ompi_comm_split (ompi_communicator_t *comm, int color, int key,
ompi_communicator_t** newcomm, bool pass_on_topo);
/**
 * Split a communicator based on color and key. Parameters
 * are identical to the MPI counterpart of the function.
 * Similar to \see ompi_comm_split with an additional info parameter.
 *
 * @param comm: input communicator
 * @param color: control of subset assignment
 * @param key: control of rank ordering
 * @param info: info object attached to the new communicator (may be NULL)
 * @param newcomm: output communicator
 * @param pass_on_topo: whether to pass the topology on to the new communicator
 *
 * @return OMPI_SUCCESS or an OMPI error code
 */
OMPI_DECLSPEC int ompi_comm_split_with_info( ompi_communicator_t* comm, int color, int key,
opal_info_t *info,
ompi_communicator_t **newcomm, bool pass_on_topo );
/**
* split a communicator based on type and key. Parameters
* are identical to the MPI-counterpart of the function.

Просмотреть файл

@ -578,3 +578,31 @@ bool ompi_group_have_remote_peers (ompi_group_t *group)
return false;
}
/**
* Count the number of processes on this group that share the same node as
* this process.
*/
/**
 * Count the number of processes in this group that share the same node as
 * this process.
 */
int ompi_group_count_local_peers (ompi_group_t *group)
{
    int count = 0;

    for (int rank = 0 ; rank < group->grp_proc_count ; ++rank) {
        ompi_proc_t *proc;
#if OMPI_GROUP_SPARSE
        proc = ompi_group_peer_lookup (group, rank);
#else
        proc = ompi_group_get_proc_ptr_raw (group, rank);
        if (ompi_proc_is_sentinel (proc)) {
            /* A sentinel means no local ompi_proc_t exists. Any process
             * residing on our node would have been stored in the group or
             * cached in the proc hash table (see ompi_proc_complete_init),
             * so a sentinel cannot be a local peer. */
            continue;
        }
#endif
        if (OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)) {
            ++count;
        }
    }

    return count;
}

Просмотреть файл

@ -420,8 +420,16 @@ static inline struct ompi_proc_t *ompi_group_peer_lookup_existing (ompi_group_t
return ompi_group_get_proc_ptr (group, peer_id, false);
}
/**
* Return true if all processes in the group are not on the local node.
*/
bool ompi_group_have_remote_peers (ompi_group_t *group);
/**
* Count the number of processes on the local node.
*/
int ompi_group_count_local_peers (ompi_group_t *group);
/**
* Function to print the group info
*/

Просмотреть файл

@ -178,7 +178,7 @@ static int send_cb(ompi_request_t * req)
|| (context->con->tree->tree_nextsize > 0 && rank != context->con->root
&& num_sent == context->con->tree->tree_nextsize * context->con->num_segs
&& num_recv_fini == context->con->num_segs)) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Singal in send\n",
OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Signal in send\n",
ompi_comm_rank(context->con->comm)));
ibcast_request_fini(context);
}
@ -306,7 +306,7 @@ static int recv_cb(ompi_request_t * req)
&& num_recv_fini == context->con->num_segs)
|| (context->con->tree->tree_nextsize == 0
&& num_recv_fini == context->con->num_segs)) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Singal in recv\n",
OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Signal in recv\n",
ompi_comm_rank(context->con->comm)));
ibcast_request_fini(context);
}

Просмотреть файл

@ -21,6 +21,7 @@
* and Technology (RIST). All rights reserved.
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
* Copyright (c) 2020 BULL S.A.S. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -37,6 +38,7 @@
#include "mpi.h"
#include "ompi/communicator/communicator.h"
#include "opal/util/output.h"
#include "opal/util/argv.h"
#include "opal/util/show_help.h"
#include "opal/class/opal_list.h"
#include "opal/class/opal_object.h"
@ -44,20 +46,12 @@
#include "opal/mca/base/base.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/base.h"
#include "ompi/mca/coll/base/coll_base_util.h"
/*
* Local types
* Stuff for the OBJ interface
*/
struct avail_coll_t {
opal_list_item_t super;
int ac_priority;
mca_coll_base_module_2_3_0_t *ac_module;
const char * ac_component_name;
};
typedef struct avail_coll_t avail_coll_t;
OBJ_CLASS_INSTANCE(mca_coll_base_avail_coll_t, opal_list_item_t, NULL, NULL);
/*
* Local functions
@ -77,12 +71,6 @@ static int query_2_0_0(const mca_coll_base_component_2_0_0_t *
int *priority,
mca_coll_base_module_2_3_0_t ** module);
/*
* Stuff for the OBJ interface
*/
static OBJ_CLASS_INSTANCE(avail_coll_t, opal_list_item_t, NULL, NULL);
#define COPY(module, comm, func) \
do { \
if (NULL != module->coll_ ## func) { \
@ -138,11 +126,14 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm)
/* FIX ME - Do some kind of collective operation to find a module
that everyone has available */
/* List to store every valid module */
comm->c_coll->module_list = OBJ_NEW(opal_list_t);
/* do the selection loop */
for (item = opal_list_remove_first(selectable);
NULL != item; item = opal_list_remove_first(selectable)) {
avail_coll_t *avail = (avail_coll_t *) item;
mca_coll_base_avail_coll_t *avail = (mca_coll_base_avail_coll_t *) item;
/* initialize the module */
ret = avail->ac_module->coll_module_enable(avail->ac_module, comm);
@ -153,6 +144,9 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm)
(OMPI_SUCCESS == ret ? "Enabled": "Disabled") );
if (OMPI_SUCCESS == ret) {
/* Save every component that is initialized,
* queried and enabled successfully */
opal_list_append(comm->c_coll->module_list, &avail->super);
/* copy over any of the pointers */
COPY(avail->ac_module, comm, allgather);
@ -230,10 +224,11 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm)
COPY(avail->ac_module, comm, neighbor_alltoallw_init);
COPY(avail->ac_module, comm, reduce_local);
} else {
/* release the original module reference and the list item */
OBJ_RELEASE(avail->ac_module);
OBJ_RELEASE(avail);
}
/* release the original module reference and the list item */
OBJ_RELEASE(avail->ac_module);
OBJ_RELEASE(avail);
}
/* Done with the list from the check_components() call so release it. */
@ -306,8 +301,8 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm)
static int avail_coll_compare (opal_list_item_t **a,
opal_list_item_t **b) {
avail_coll_t *acoll = (avail_coll_t *) *a;
avail_coll_t *bcoll = (avail_coll_t *) *b;
mca_coll_base_avail_coll_t *acoll = (mca_coll_base_avail_coll_t *) *a;
mca_coll_base_avail_coll_t *bcoll = (mca_coll_base_avail_coll_t *) *b;
if (acoll->ac_priority > bcoll->ac_priority) {
return 1;
@ -318,6 +313,20 @@ static int avail_coll_compare (opal_list_item_t **a,
return 0;
}
/*
 * Return 1 if component_name appears in the NULL-terminated argv array,
 * 0 otherwise (including when argv itself is NULL).
 */
static inline int
component_in_argv(char **argv, const char* component_name)
{
    if( NULL == argv ) {
        return 0;
    }
    for( int i = 0; NULL != argv[i]; i++ ) {
        if( 0 == strcmp(argv[i], component_name) ) {
            return 1;
        }
    }
    return 0;
}
/*
* For each module in the list, check and see if it wants to run, and
* do the resulting priority comparison. Make a list of modules to be
@ -327,13 +336,66 @@ static int avail_coll_compare (opal_list_item_t **a,
static opal_list_t *check_components(opal_list_t * components,
ompi_communicator_t * comm)
{
int priority;
int priority, flag;
const mca_base_component_t *component;
mca_base_component_list_item_t *cli;
mca_coll_base_module_2_3_0_t *module;
opal_list_t *selectable;
avail_coll_t *avail;
mca_coll_base_avail_coll_t *avail;
char info_val[OPAL_MAX_INFO_VAL+1];
char **coll_argv = NULL, **coll_exclude = NULL, **coll_include = NULL;
/* Check if this communicator comes with restrictions on the collective modules
* it wants to use. The restrictions are consistent with the MCA parameter
* to limit the collective components loaded, but it applies for each
* communicator and is provided as an info key during the communicator
* creation. Unlike the MCA param, this info key is used not to select
* components but either to prevent components from being used or to
* force a change in the component priority.
*/
if( NULL != comm->super.s_info) {
opal_info_get(comm->super.s_info, "ompi_comm_coll_preference",
sizeof(info_val), info_val, &flag);
if( !flag ) {
goto proceed_to_select;
}
coll_argv = opal_argv_split(info_val, ',');
if(NULL == coll_argv) {
goto proceed_to_select;
}
int idx2, count_include = opal_argv_count(coll_argv);
/* Allocate the coll_include argv */
coll_include = (char**)malloc((count_include + 1) * sizeof(char*));
coll_include[count_include] = NULL; /* NULL terminated array */
/* Dispatch the include/exclude in the corresponding arrays */
for( int idx = 0; NULL != coll_argv[idx]; idx++ ) {
if( '^' == coll_argv[idx][0] ) {
coll_include[idx] = NULL; /* NULL terminated array */
/* Allocate the coll_exclude argv */
coll_exclude = (char**)malloc((count_include - idx + 1) * sizeof(char*));
/* save the exclude components */
for( idx2 = idx; NULL != coll_argv[idx2]; idx2++ ) {
coll_exclude[idx2 - idx] = coll_argv[idx2];
}
coll_exclude[idx2 - idx] = NULL; /* NULL-terminated array */
coll_exclude[0] = coll_exclude[0] + 1; /* get rid of the ^ */
count_include = idx;
break;
}
coll_include[idx] = coll_argv[idx];
}
/* Reverse the order of the coll_include argv to facilitate ordering the
 * selected components in reverse.
 */
for( idx2 = 0; idx2 < (count_include - 1); idx2++ ) {
char* temp = coll_include[idx2];
coll_include[idx2] = coll_include[count_include - 1];
coll_include[count_include - 1] = temp;
count_include--;
}
}
proceed_to_select:
/* Make a list of the components that query successfully */
selectable = OBJ_NEW(opal_list_t);
@ -341,11 +403,18 @@ static opal_list_t *check_components(opal_list_t * components,
OPAL_LIST_FOREACH(cli, &ompi_coll_base_framework.framework_components, mca_base_component_list_item_t) {
component = cli->cli_component;
/* dont bother is we have this component in the exclusion list */
if( component_in_argv(coll_exclude, component->mca_component_name) ) {
opal_output_verbose(10, ompi_coll_base_framework.framework_output,
"coll:base:comm_select: component disqualified: %s (due to communicator info key)",
component->mca_component_name );
continue;
}
priority = check_one_component(comm, component, &module);
if (priority >= 0) {
/* We have a component that indicated that it wants to run
by giving us a module */
avail = OBJ_NEW(avail_coll_t);
avail = OBJ_NEW(mca_coll_base_avail_coll_t);
avail->ac_priority = priority;
avail->ac_module = module;
// Point to the string so we don't have to free later
@ -376,6 +445,27 @@ static opal_list_t *check_components(opal_list_t * components,
/* Put this list in priority order */
opal_list_sort(selectable, avail_coll_compare);
/* For all valid component reorder them not on their provided priorities but on
* the order requested in the info key. As at this point the coll_include is
* already ordered backward we can simply prepend the components.
*/
mca_coll_base_avail_coll_t *item, *item_next;
OPAL_LIST_FOREACH_SAFE(item, item_next,
selectable, mca_coll_base_avail_coll_t) {
if( component_in_argv(coll_include, item->ac_component_name) ) {
opal_list_remove_item(selectable, &item->super);
opal_list_prepend(selectable, &item->super);
}
}
opal_argv_free(coll_argv);
if( NULL != coll_exclude ) {
free(coll_exclude);
}
if( NULL != coll_include ) {
free(coll_include);
}
/* All done */
return selectable;
}
@ -409,7 +499,6 @@ static int check_one_component(ompi_communicator_t * comm,
return priority;
}
/**************************************************************************
* Query functions
**************************************************************************/

Просмотреть файл

@ -16,6 +16,7 @@
* and Technology (RIST). All rights reserved.
* Copyright (c) 2017 IBM Corporation. All rights reserved.
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
* Copyright (c) 2020 BULL S.A.S. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -34,6 +35,7 @@
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/base/base.h"
#include "ompi/mca/coll/base/coll_base_util.h"
#define CLOSE(comm, func) \
do { \
@ -50,6 +52,8 @@
int mca_coll_base_comm_unselect(ompi_communicator_t * comm)
{
opal_list_item_t *item;
CLOSE(comm, allgather);
CLOSE(comm, allgatherv);
CLOSE(comm, allreduce);
@ -124,6 +128,17 @@ int mca_coll_base_comm_unselect(ompi_communicator_t * comm)
CLOSE(comm, reduce_local);
for (item = opal_list_remove_first(comm->c_coll->module_list);
NULL != item; item = opal_list_remove_first(comm->c_coll->module_list)) {
mca_coll_base_avail_coll_t *avail = (mca_coll_base_avail_coll_t *) item;
if(avail->ac_module) {
OBJ_RELEASE(avail->ac_module);
}
OBJ_RELEASE(avail);
}
OBJ_RELEASE(comm->c_coll->module_list);
free(comm->c_coll);
comm->c_coll = NULL;

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2016 The University of Tennessee and The University
* Copyright (c) 2004-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -29,6 +29,8 @@
#include "ompi/mca/topo/base/base.h"
#include "ompi/mca/pml/pml.h"
#include "coll_base_util.h"
#include "coll_base_functions.h"
#include <ctype.h>
int ompi_coll_base_sendrecv_actual( const void* sendbuf, size_t scount,
ompi_datatype_t* sdatatype,
@ -268,7 +270,7 @@ int ompi_coll_base_retain_datatypes_w( ompi_request_t *req,
} else {
scount = rcount = OMPI_COMM_IS_INTER(comm)?ompi_comm_remote_size(comm):ompi_comm_size(comm);
}
for (int i=0; i<scount; i++) {
if (NULL != stypes && NULL != stypes[i] && !ompi_datatype_is_predefined(stypes[i])) {
OBJ_RETAIN(stypes[i]);
@ -297,7 +299,8 @@ int ompi_coll_base_retain_datatypes_w( ompi_request_t *req,
return OMPI_SUCCESS;
}
static void nbc_req_cons(ompi_coll_base_nbc_request_t *req) {
static void nbc_req_cons(ompi_coll_base_nbc_request_t *req)
{
req->cb.req_complete_cb = NULL;
req->req_complete_cb_data = NULL;
req->data.objs.objs[0] = NULL;
@ -305,3 +308,253 @@ static void nbc_req_cons(ompi_coll_base_nbc_request_t *req) {
}
OBJ_CLASS_INSTANCE(ompi_coll_base_nbc_request_t, ompi_request_t, nbc_req_cons, NULL);
/* File reading functions */
/* Consume characters until a newline (counted in *fileline) or EOF. */
static void skiptonewline (FILE *fptr, int *fileline)
{
    char ch;

    /* fread returns the number of elements read: 1 per character,
     * 0 on end-of-file or error. */
    while( 1 == fread(&ch, 1, 1, fptr) ) {
        if( '\n' == ch ) {
            (*fileline)++;
            return;
        }
    }
}
/**
 * Read the next long from the file, skipping comments ('#' up to the end of
 * the line) and any token that does not parse as a long.
 *
 * @param fptr     open file to read from
 * @param fileline in/out line counter, incremented for each newline consumed
 * @param val      [out] the value read
 * @return 0 on success, -1 on end-of-file or read error
 */
int ompi_coll_base_file_getnext_long(FILE *fptr, int *fileline, long* val)
{
    char trash;
    int rc;

    do {
        rc = fscanf(fptr, "%li", val);
        if (rc == EOF) {
            return -1;
        }
        if (1 == rc) {
            return 0;
        }
        /* The token did not parse as a long: consume one character and retry.
         * Note: fread returns an element count, never EOF, so 0 (not EOF)
         * signals end-of-file or error here. */
        if (1 != fread(&trash, sizeof(char), 1, fptr)) {
            return -1;
        }
        if ('\n' == trash) (*fileline)++;
        if ('#' == trash) {
            skiptonewline (fptr, fileline);
        }
    } while (1);
}
/**
 * Read the next whitespace-delimited token (at most 32 characters) from the
 * file, skipping comments. The returned string is heap-allocated; the caller
 * owns and must free it.
 *
 * @param fptr     open file to read from
 * @param fileline in/out line counter, incremented for each newline consumed
 * @param val      [out] newly allocated token, or NULL on failure
 * @return 0 on success, -1 on end-of-file, read error, or allocation failure
 */
int ompi_coll_base_file_getnext_string(FILE *fptr, int *fileline, char** val)
{
    char trash, token[33];  /* "%32s" stores up to 32 chars plus the NUL */
    int rc;

    *val = NULL;  /* security in case we fail */
    do {
        rc = fscanf(fptr, "%32s", token);
        if (rc == EOF) {
            return -1;
        }
        if (1 == rc) {
            if( '#' == token[0] ) {
                skiptonewline(fptr, fileline);
                continue;
            }
            *val = (char*)malloc(strlen(token) + 1);
            if (NULL == *val) {
                return -1;  /* out of memory */
            }
            strcpy(*val, token);
            return 0;
        }
        /* The token could not be read: consume one character and retry.
         * fread returns an element count, never EOF, so 0 means EOF/error. */
        if (1 != fread(&trash, sizeof(char), 1, fptr)) {
            return -1;
        }
        if ('\n' == trash) (*fileline)++;
        if ('#' == trash) {
            skiptonewline (fptr, fileline);
        }
    } while (1);
}
/**
 * Read the next size_t from the file, skipping comments ('#' up to the end
 * of the line) and any token that does not parse as a size_t.
 *
 * @param fptr     open file to read from
 * @param fileline in/out line counter, incremented for each newline consumed
 * @param val      [out] the value read
 * @return 0 on success, -1 on end-of-file or read error
 */
int ompi_coll_base_file_getnext_size_t(FILE *fptr, int *fileline, size_t* val)
{
    char trash;
    int rc;

    do {
        rc = fscanf(fptr, "%" PRIsize_t, val);
        if (rc == EOF) {
            return -1;
        }
        if (1 == rc) {
            return 0;
        }
        /* The token did not parse as a size_t: consume one character and
         * retry. fread returns an element count, never EOF, so 0 (not EOF)
         * signals end-of-file or error here. */
        if (1 != fread(&trash, sizeof(char), 1, fptr)) {
            return -1;
        }
        if ('\n' == trash) (*fileline)++;
        if ('#' == trash) {
            skiptonewline (fptr, fileline);
        }
    } while (1);
}
/*
 * Peek at the next meaningful character (skipping newlines, blanks and
 * '#' comments). If it equals 'expected' the character is consumed and 1 is
 * returned; otherwise it is pushed back and 0 is returned. Returns -1 on
 * end-of-file or seek error.
 */
int ompi_coll_base_file_peek_next_char_is(FILE *fptr, int *fileline, int expected)
{
    char ch;

    for( ;; ) {
        if( 1 != fread(&ch, sizeof(char), 1, fptr) ) {
            return -1;  /* hit the end of the file */
        }
        if( '\n' == ch ) {
            (*fileline)++;
            continue;
        }
        if( '#' == ch ) {
            skiptonewline (fptr, fileline);
            continue;
        }
        if( ch == expected ) {
            return 1;  /* return true and eat the char */
        }
        if( isblank(ch) ) {
            continue;  /* skip all spaces if that's not what we were looking for */
        }
        /* Not the expected char: put it back and report false */
        if( 0 != fseek(fptr, -1, SEEK_CUR) ) {
            return -1;
        }
        return 0;
    }
}
/**
 * Convert a collective name (e.g. "allreduce") into its COLLTYPE_T
 * identifier, or return -1 when the name matches no supported collective.
 *
 * There are certainly simpler implementations of this function when
 * performance is not a critical point. But, as this function is used during
 * the collective configuration, and we can do this configuration once for
 * each communicator, I would rather have a more complex but faster
 * implementation. The approach here is to search for the largest common
 * denominators, to create something similar to a dichotomic search.
 */
int mca_coll_base_name_to_colltype(const char* name)
{
    /* neighbor_* collectives */
    if( 'n' == name[0] ) {
        if( 0 == strncmp(name, "neighbor_all", 12) ) {
            if( 't' != name[12] ) {
                /* neighbor_allgather[v] */
                if( 0 == strncmp(name+12, "gather", 6) ) {
                    if('\0' == name[18]) return NEIGHBOR_ALLGATHER;
                    if( 'v' == name[18]) return NEIGHBOR_ALLGATHERV;
                }
            } else {
                /* neighbor_alltoall[v|w] */
                if( 0 == strncmp(name+12, "toall", 5) ) {
                    if( '\0' == name[17] ) return NEIGHBOR_ALLTOALL;
                    if( 'v' == name[17] ) return NEIGHBOR_ALLTOALLV;
                    if( 'w' == name[17] ) return NEIGHBOR_ALLTOALLW;
                }
            }
        }
        return -1;
    }
    /* all* collectives */
    if( 'a' == name[0] ) {
        if( 0 != strncmp(name, "all", 3) ) {
            return -1;
        }
        if( 't' != name[3] ) {
            if( 'r' == name[3] ) {
                if( 0 == strcmp(name+3, "reduce") )
                    return ALLREDUCE;
            } else {
                /* allgather[v] */
                if( 0 == strncmp(name+3, "gather", 6) ) {
                    if( '\0' == name[9] ) return ALLGATHER;
                    if( 'v' == name[9] ) return ALLGATHERV;
                }
            }
        } else {
            /* alltoall[v|w] */
            if( 0 == strncmp(name+3, "toall", 5) ) {
                if( '\0' == name[8] ) return ALLTOALL;
                if( 'v' == name[8] ) return ALLTOALLV;
                if( 'w' == name[8] ) return ALLTOALLW;
            }
        }
        return -1;
    }
    /* names starting before 'r': barrier, bcast, exscan, gather[v] */
    if( 'r' > name[0] ) {
        if( 'b' == name[0] ) {
            if( 0 == strcmp(name, "barrier") )
                return BARRIER;
            if( 0 == strcmp(name, "bcast") )
                return BCAST;
        } else if( 'g'== name[0] ) {
            if( 0 == strncmp(name, "gather", 6) ) {
                if( '\0' == name[6] ) return GATHER;
                if( 'v' == name[6] ) return GATHERV;
            }
        }
        if( 0 == strcmp(name, "exscan") )
            return EXSCAN;
        return -1;
    }
    /* names starting before 's': reduce, reduce_scatter[_block] */
    if( 's' > name[0] ) {
        if( 0 == strncmp(name, "reduce", 6) ) {
            if( '\0' == name[6] ) return REDUCE;
            if( '_' == name[6] ) {
                if( 0 == strncmp(name+7, "scatter", 7) ) {
                    if( '\0' == name[14] ) return REDUCESCATTER;
                    if( 0 == strcmp(name+14, "_block") ) return REDUCESCATTERBLOCK;
                }
            }
        }
        return -1;
    }
    /* remaining: scan, scatterv, scatter (scatterv checked before scatter) */
    if( 0 == strcmp(name, "scan") )
        return SCAN;
    if( 0 == strcmp(name, "scatterv") )
        return SCATTERV;
    if( 0 == strcmp(name, "scatter") )
        return SCATTER;
    return -1;
}
/* Conversion table for all COLLTYPE_T values defined in
 * ompi/mca/coll/base/coll_base_functions.h. Indexed by the COLLTYPE_T id;
 * the COLLCOUNT sentinel entry is NULL. */
static const char* colltype_translation_table[] = {
    [ALLGATHER] = "allgather",
    [ALLGATHERV] = "allgatherv",
    [ALLREDUCE] = "allreduce",
    [ALLTOALL] = "alltoall",
    [ALLTOALLV] = "alltoallv",
    [ALLTOALLW] = "alltoallw",
    [BARRIER] = "barrier",
    [BCAST] = "bcast",
    [EXSCAN] = "exscan",
    [GATHER] = "gather",
    [GATHERV] = "gatherv",
    [REDUCE] = "reduce",
    [REDUCESCATTER] = "reduce_scatter",
    [REDUCESCATTERBLOCK] = "reduce_scatter_block",
    [SCAN] = "scan",
    [SCATTER] = "scatter",
    [SCATTERV] = "scatterv",
    [NEIGHBOR_ALLGATHER] = "neighbor_allgather",
    [NEIGHBOR_ALLGATHERV] = "neighbor_allgatherv",
    [NEIGHBOR_ALLTOALL] = "neighbor_alltoall",
    [NEIGHBOR_ALLTOALLV] = "neighbor_alltoallv",
    [NEIGHBOR_ALLTOALLW] = "neighbor_alltoallw",
    [COLLCOUNT] = NULL
};
/*
 * Return a newly allocated string with the name of the collective identified
 * by collid, or NULL when collid is out of range. Caller frees the result.
 */
char* mca_coll_base_colltype_to_str(int collid)
{
    if( (collid >= 0) && (collid < COLLCOUNT) ) {
        return strdup(colltype_translation_table[collid]);
    }
    return NULL;  /* invalid collective identifier */
}

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2015 The University of Tennessee and The University
* Copyright (c) 2004-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
@ -84,6 +84,19 @@ ompi_coll_base_nbc_reserve_tags(ompi_communicator_t* comm, int32_t reserve)
typedef struct ompi_coll_base_nbc_request_t ompi_coll_base_nbc_request_t;
/*
 * Structure to store an available (queried and enabled) collective module,
 * kept on a communicator's module_list as an opal_list item.
 */
struct mca_coll_base_avail_coll_t {
    opal_list_item_t super;            /* base class: list item */
    int ac_priority;                   /* priority reported by the component's query */
    mca_coll_base_module_t *ac_module; /* the module instance (reference held) */
    const char * ac_component_name;    /* component name string; points at the
                                        * component's own storage, not owned here */
};
typedef struct mca_coll_base_avail_coll_t mca_coll_base_avail_coll_t;

OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_coll_base_avail_coll_t);
/**
* A MPI_like function doing a send and a receive simultaneously.
* If one of the communications results in a zero-byte message the
@ -164,5 +177,18 @@ int ompi_coll_base_retain_datatypes_w( ompi_request_t *request,
ompi_datatype_t * const stypes[],
ompi_datatype_t * const rtypes[]);
/* File reading function */
int ompi_coll_base_file_getnext_long(FILE *fptr, int *fileline, long* val);
int ompi_coll_base_file_getnext_size_t(FILE *fptr, int *fileline, size_t* val);
int ompi_coll_base_file_getnext_string(FILE *fptr, int *fileline, char** val);
/* peek at the next valid token to see if it begins with the expected value. If yes
* eat the value, otherwise put it back into the file.
*/
int ompi_coll_base_file_peek_next_char_is(FILE *fptr, int *fileline, int expected);
/* Miscellaneous functions */
char* mca_coll_base_colltype_to_str(int collid);
int mca_coll_base_name_to_colltype(const char* name);
END_C_DECLS
#endif /* MCA_COLL_BASE_UTIL_EXPORT_H */

Просмотреть файл

@ -19,6 +19,7 @@
* and Technology (RIST). All rights reserved.
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
* Copyright (c) 2020 BULL S.A.S. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -767,6 +768,9 @@ struct mca_coll_base_comm_coll_t {
mca_coll_base_module_reduce_local_fn_t coll_reduce_local;
mca_coll_base_module_2_3_0_t *coll_reduce_local_module;
/* List of modules initialized, queried and enabled */
opal_list_t *module_list;
};
typedef struct mca_coll_base_comm_coll_t mca_coll_base_comm_coll_t;

54
ompi/mca/coll/han/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,54 @@
#
# Copyright (c) 2018-2020 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
sources = \
coll_han.h \
coll_han_trigger.h \
coll_han_dynamic.h \
coll_han_dynamic_file.h \
coll_han_bcast.c \
coll_han_reduce.c \
coll_han_scatter.c \
coll_han_gather.c \
coll_han_allreduce.c \
coll_han_allgather.c \
coll_han_component.c \
coll_han_module.c \
coll_han_trigger.c \
coll_han_dynamic.c \
coll_han_dynamic_file.c \
coll_han_topo.c \
coll_han_subcomms.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
component_noinst =
component_install =
if MCA_BUILD_ompi_coll_han_DSO
component_install += mca_coll_han.la
else
component_noinst += libmca_coll_han.la
endif
# See ompi/mca/btl/sm/Makefile.am for an explanation of
# libmca_common_sm.la.
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_coll_han_la_SOURCES = $(sources)
mca_coll_han_la_LDFLAGS = -module -avoid-version
mca_coll_han_la_LIBADD =
noinst_LTLIBRARIES = $(component_noinst)
libmca_coll_han_la_SOURCES =$(sources)
libmca_coll_han_la_LDFLAGS = -module -avoid-version

539
ompi/mca/coll/han/coll_han.h Обычный файл
Просмотреть файл

@ -0,0 +1,539 @@
/*
* Copyright (c) 2018-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_COLL_HAN_EXPORT_H
#define MCA_COLL_HAN_EXPORT_H
#include "ompi_config.h"
#include "mpi.h"
#include "ompi/mca/mca.h"
#include "opal/util/output.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_han_trigger.h"
#include "ompi/mca/coll/han/coll_han_dynamic.h"
/*
* Today;
* . only 2 modules available for intranode (low) level
* . only 2 modules available for internode (up) level
*/
#define COLL_HAN_LOW_MODULES 2
#define COLL_HAN_UP_MODULES 2
/* Argument bundle shared by the tasks of a hierarchical (han) bcast. */
struct mca_coll_han_bcast_args_s {
    mca_coll_task_t *cur_task;      /* task currently executing */
    ompi_communicator_t *up_comm;   /* upper (inter-node) level communicator */
    ompi_communicator_t *low_comm;  /* lower (intra-node) level communicator */
    void *buff;                     /* data buffer being broadcast */
    ompi_datatype_t *dtype;         /* datatype of the buffer elements */
    int seg_count;                  /* number of elements per segment */
    int root_low_rank;              /* root's rank within the low communicator */
    int root_up_rank;               /* root's rank within the up communicator */
    int num_segments;               /* total number of segments */
    int cur_seg;                    /* index of the segment in progress */
    int w_rank;                     /* this process' rank in the original communicator */
    int last_seg_count;             /* element count of the final (possibly shorter) segment */
    bool noop;                      /* true when this process does no work at this level */
};
typedef struct mca_coll_han_bcast_args_s mca_coll_han_bcast_args_t;
/* Argument bundle shared by the tasks of a hierarchical (han) reduce. */
struct mca_coll_han_reduce_args_s {
    mca_coll_task_t *cur_task;      /* task currently executing */
    ompi_communicator_t *up_comm;   /* upper (inter-node) level communicator */
    ompi_communicator_t *low_comm;  /* lower (intra-node) level communicator */
    void *sbuf;                     /* send buffer */
    void *rbuf;                     /* receive buffer */
    ompi_op_t *op;                  /* reduction operation */
    ompi_datatype_t *dtype;         /* datatype of the buffer elements */
    int seg_count;                  /* number of elements per segment */
    int root_low_rank;              /* root's rank within the low communicator */
    int root_up_rank;               /* root's rank within the up communicator */
    int num_segments;               /* total number of segments */
    int cur_seg;                    /* index of the segment in progress */
    int w_rank;                     /* this process' rank in the original communicator */
    int last_seg_count;             /* element count of the final (possibly shorter) segment */
    bool noop;                      /* true when this process does no work at this level */
    bool is_tmp_rbuf;               /* rbuf is a temporary allocation to release after
                                     * use — NOTE(review): confirm ownership semantics */
};
typedef struct mca_coll_han_reduce_args_s mca_coll_han_reduce_args_t;
/* Argument bundle shared by the tasks of a hierarchical (han) allreduce. */
struct mca_coll_han_allreduce_args_s {
    mca_coll_task_t *cur_task;      /* task currently executing */
    ompi_communicator_t *up_comm;   /* upper (inter-node) level communicator */
    ompi_communicator_t *low_comm;  /* lower (intra-node) level communicator */
    ompi_request_t *req;            /* request to complete when the operation finishes */
    void *sbuf;                     /* send buffer */
    void *rbuf;                     /* receive buffer */
    ompi_op_t *op;                  /* reduction operation */
    ompi_datatype_t *dtype;         /* datatype of the buffer elements */
    int seg_count;                  /* number of elements per segment */
    int root_up_rank;               /* root's rank within the up communicator */
    int root_low_rank;              /* root's rank within the low communicator */
    int num_segments;               /* total number of segments */
    int cur_seg;                    /* index of the segment in progress */
    int w_rank;                     /* this process' rank in the original communicator */
    int last_seg_count;             /* element count of the final (possibly shorter) segment */
    bool noop;                      /* true when this process does no work at this level */
    int *completed;                 /* shared completion counter across segments —
                                     * NOTE(review): confirm exact semantics */
};
typedef struct mca_coll_han_allreduce_args_s mca_coll_han_allreduce_args_t;
/* Argument bundle shared by the tasks of a hierarchical (han) scatter. */
struct mca_coll_han_scatter_args_s {
    mca_coll_task_t *cur_task;      /* task currently executing */
    ompi_communicator_t *up_comm;   /* upper (inter-node) level communicator */
    ompi_communicator_t *low_comm;  /* lower (intra-node) level communicator */
    ompi_request_t *req;            /* request to complete when the operation finishes */
    void *sbuf;                     /* send buffer */
    void *sbuf_inter_free;          /* temporary buffer to free after the inter-node
                                     * phase — NOTE(review): confirm ownership */
    void *sbuf_reorder_free;        /* temporary reorder buffer to free —
                                     * NOTE(review): confirm ownership */
    void *rbuf;                     /* receive buffer */
    ompi_datatype_t *sdtype;        /* send datatype */
    ompi_datatype_t *rdtype;        /* receive datatype */
    int scount;                     /* send count per process */
    int rcount;                     /* receive count */
    int root;                       /* root rank in the original communicator */
    int root_up_rank;               /* root's rank within the up communicator */
    int root_low_rank;              /* root's rank within the low communicator */
    int w_rank;                     /* this process' rank in the original communicator */
    bool noop;                      /* true when this process does no work at this level */
};
typedef struct mca_coll_han_scatter_args_s mca_coll_han_scatter_args_t;
/* Argument bundle shared by the tasks of a hierarchical (han) gather. */
struct mca_coll_han_gather_args_s {
    mca_coll_task_t *cur_task;      /* task currently executing */
    ompi_communicator_t *up_comm;   /* upper (inter-node) level communicator */
    ompi_communicator_t *low_comm;  /* lower (intra-node) level communicator */
    ompi_request_t *req;            /* request to complete when the operation finishes */
    void *sbuf;                     /* send buffer */
    void *sbuf_inter_free;          /* temporary buffer to free after the inter-node
                                     * phase — NOTE(review): confirm ownership */
    void *rbuf;                     /* receive buffer */
    ompi_datatype_t *sdtype;        /* send datatype */
    ompi_datatype_t *rdtype;        /* receive datatype */
    int scount;                     /* send count */
    int rcount;                     /* receive count per process */
    int root;                       /* root rank in the original communicator */
    int root_up_rank;               /* root's rank within the up communicator */
    int root_low_rank;              /* root's rank within the low communicator */
    int w_rank;                     /* this process' rank in the original communicator */
    bool noop;                      /* true when this process does no work at this level */
    bool is_mapbycore;              /* true when ranks are laid out map-by-core, enabling
                                     * the reorder-free path — NOTE(review): confirm */
};
typedef struct mca_coll_han_gather_args_s mca_coll_han_gather_args_t;
/* Argument bundle shared by the tasks of a hierarchical (han) allgather. */
struct mca_coll_han_allgather_s {
    mca_coll_task_t *cur_task;      /* task currently executing */
    ompi_communicator_t *up_comm;   /* upper (inter-node) level communicator */
    ompi_communicator_t *low_comm;  /* lower (intra-node) level communicator */
    ompi_request_t *req;            /* request to complete when the operation finishes */
    void *sbuf;                     /* send buffer */
    void *sbuf_inter_free;          /* temporary buffer to free after the inter-node
                                     * phase — NOTE(review): confirm ownership */
    void *rbuf;                     /* receive buffer */
    ompi_datatype_t *sdtype;        /* send datatype */
    ompi_datatype_t *rdtype;        /* receive datatype */
    int scount;                     /* send count */
    int rcount;                     /* receive count per process */
    int root_low_rank;              /* root's rank within the low communicator */
    int w_rank;                     /* this process' rank in the original communicator */
    bool noop;                      /* true when this process does no work at this level */
    bool is_mapbycore;              /* true when ranks are laid out map-by-core —
                                     * NOTE(review): confirm */
    int *topo;                      /* cached topology information array */
};
typedef struct mca_coll_han_allgather_s mca_coll_han_allgather_t;
/**
 * Structure to hold the han coll component. First it holds the
 * base coll component, and then holds a bunch of
 * han-coll-component-specific stuff (e.g., current MCA param
 * values).
 */
typedef struct mca_coll_han_component_t {
    /** Base coll component */
    mca_coll_base_component_2_0_0_t super;

    /** MCA parameter: Priority of this component */
    int han_priority;
    /* opal output stream id used for the component's log messages */
    int han_output;
    /* segment size for bcast */
    uint32_t han_bcast_segsize;
    /* up level module for bcast */
    uint32_t han_bcast_up_module;
    /* low level module for bcast */
    uint32_t han_bcast_low_module;
    /* segment size for reduce */
    uint32_t han_reduce_segsize;
    /* up level module for reduce */
    uint32_t han_reduce_up_module;
    /* low level module for reduce */
    uint32_t han_reduce_low_module;
    /* segment size for allreduce */
    uint32_t han_allreduce_segsize;
    /* up level module for allreduce */
    uint32_t han_allreduce_up_module;
    /* low level module for allreduce */
    uint32_t han_allreduce_low_module;
    /* up level module for allgather */
    uint32_t han_allgather_up_module;
    /* low level module for allgather */
    uint32_t han_allgather_low_module;
    /* up level module for gather */
    uint32_t han_gather_up_module;
    /* low level module for gather */
    uint32_t han_gather_low_module;
    /* up level module for scatter */
    uint32_t han_scatter_up_module;
    /* low level module for scatter */
    uint32_t han_scatter_low_module;
    /* whether we need reproducible results
     * (but disables topological optimisations)
     */
    uint32_t han_reproducible;
    /* per-collective flag selecting the simple (non-segmented) algorithm —
     * NOTE(review): confirm exact meaning */
    bool use_simple_algorithm[COLLCOUNT];

    /* Dynamic configuration rules */
    bool use_dynamic_file_rules;     /* read selection rules from a file */
    bool dump_dynamic_rules;         /* print the parsed rules */
    char* dynamic_rules_filename;    /* path to the rules file */
    /* Dynamic rules from file */
    mca_coll_han_dynamic_rules_t dynamic_rules;
    /* Dynamic rules from mca parameter */
    COMPONENT_T mca_rules[COLLCOUNT][NB_TOPO_LVL];

    /* Define maximum dynamic errors printed by rank 0 with a 0 verbosity level */
    int max_dynamic_errors;
} mca_coll_han_component_t;
/* Generic function-pointer type used to treat the union below uniformly. */
typedef void (*previous_dummy_fn_t) (void);

/*
 * Structure used to store what is necessary for the collective operations
 * routines in case of fallback: the previous function pointer for one
 * collective (as a union, since only one member is meaningful at a time)
 * together with the module that provided it.
 */
typedef struct mca_coll_han_single_collective_fallback_s {
    union {
        mca_coll_base_module_allgather_fn_t allgather;
        mca_coll_base_module_allgatherv_fn_t allgatherv;
        mca_coll_base_module_allreduce_fn_t allreduce;
        mca_coll_base_module_bcast_fn_t bcast;
        mca_coll_base_module_gather_fn_t gather;
        mca_coll_base_module_reduce_fn_t reduce;
        mca_coll_base_module_scatter_fn_t scatter;
        previous_dummy_fn_t dummy;
    };
    mca_coll_base_module_t* module;  /* module owning the stored function */
} mca_coll_han_single_collective_fallback_t;
/*
 * The structure containing a replacement for all collectives supported
 * by HAN. This structure is used as a fallback during subcommunicator
 * creation.
 */
typedef struct mca_coll_han_collectives_fallback_s {
    mca_coll_han_single_collective_fallback_t allgather;
    mca_coll_han_single_collective_fallback_t allgatherv;
    mca_coll_han_single_collective_fallback_t allreduce;
    mca_coll_han_single_collective_fallback_t bcast;
    mca_coll_han_single_collective_fallback_t reduce;
    mca_coll_han_single_collective_fallback_t gather;
    mca_coll_han_single_collective_fallback_t scatter;
} mca_coll_han_collectives_fallback_t;
/** Coll han module: per-communicator state of the han component. */
typedef struct mca_coll_han_module_t {
    /** Base module */
    mca_coll_base_module_t super;

    /* Whether this module has been lazily initialized or not yet */
    bool enabled;

    /* cached intra-node (low) communicators — NOTE(review): confirm lifetime */
    struct ompi_communicator_t **cached_low_comms;
    /* cached inter-node (up) communicators — NOTE(review): confirm lifetime */
    struct ompi_communicator_t **cached_up_comms;
    /* cached virtual rank mapping */
    int *cached_vranks;
    /* cached topology information */
    int *cached_topo;
    /* true when ranks are laid out map-by-core — NOTE(review): confirm */
    bool is_mapbycore;
    /* true when nodes host unequal numbers of processes */
    bool are_ppn_imbalanced;

    /* To be able to fallback when the cases are not supported */
    struct mca_coll_han_collectives_fallback_s fallback;

    /* To be able to fallback on reproducible algorithm */
    mca_coll_base_module_reduce_fn_t reproducible_reduce;
    mca_coll_base_module_t *reproducible_reduce_module;
    mca_coll_base_module_allreduce_fn_t reproducible_allreduce;
    mca_coll_base_module_t *reproducible_allreduce_module;

    /* Topological level of this communicator */
    TOPO_LVL_T topologic_level;

    /* Collective module storage for module choice */
    mca_coll_han_collective_modules_storage_t modules_storage;
    /* whether modules_storage has been filled in */
    bool storage_initialized;

    /*
     * Number of dynamic errors encountered
     * The first mca_coll_han_component.max_dynamic_errors
     * of rank 0 are printed with verbosity = 0
     */
    int dynamic_errors;

    /* Sub-communicator per topological level */
    struct ompi_communicator_t *sub_comm[NB_TOPO_LVL];
} mca_coll_han_module_t;
OBJ_CLASS_DECLARATION(mca_coll_han_module_t);
/*
 * Some defines to stick to the naming used in the other components in terms of
 * fallback routines: han_module->previous_<coll> expands to the stored
 * fallback function, han_module->previous_<coll>_module to its module.
 */
#define previous_allgather fallback.allgather.allgather
#define previous_allgather_module fallback.allgather.module
#define previous_allgatherv fallback.allgatherv.allgatherv
#define previous_allgatherv_module fallback.allgatherv.module
#define previous_allreduce fallback.allreduce.allreduce
#define previous_allreduce_module fallback.allreduce.module
#define previous_bcast fallback.bcast.bcast
#define previous_bcast_module fallback.bcast.module
#define previous_reduce fallback.reduce.reduce
#define previous_reduce_module fallback.reduce.module
#define previous_gather fallback.gather.gather
#define previous_gather_module fallback.gather.module
#define previous_scatter fallback.scatter.scatter
#define previous_scatter_module fallback.scatter.module
/* Macro to correctly load a fallback collective module.
 * Only acts if HAN is still the active module for that collective on COMM;
 * it then restores the saved fallback function/module pair and fixes the
 * reference counts (retain the fallback module, release the HAN one). */
#define HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, COLL)                                    \
    do {                                                                                  \
        if ( ((COMM)->c_coll->coll_ ## COLL ## _module) == (mca_coll_base_module_t*)(HANM) ) { \
            (COMM)->c_coll->coll_ ## COLL = (HANM)->fallback.COLL.COLL;                   \
            mca_coll_base_module_t *coll_module = (COMM)->c_coll->coll_ ## COLL ## _module; \
            (COMM)->c_coll->coll_ ## COLL ## _module = (HANM)->fallback.COLL.module;      \
            OBJ_RETAIN((COMM)->c_coll->coll_ ## COLL ## _module);                         \
            OBJ_RELEASE(coll_module);                                                     \
        }                                                                                 \
    } while(0)
/* Macro to correctly load /all/ fallback collectives on COMM and disable the
 * HAN module so every subsequent call passes straight through.
 * BUGFIX: use the macro parameter HANM instead of hard-coding 'han_module',
 * which only compiled when the expansion site happened to have a variable of
 * that exact name in scope (classic macro-hygiene defect). */
#define HAN_LOAD_FALLBACK_COLLECTIVES(HANM, COMM)                                        \
    do {                                                                                 \
        HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, bcast);                                 \
        HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, scatter);                               \
        HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, gather);                                \
        HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, reduce);                                \
        HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, allreduce);                             \
        HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, allgather);                             \
        HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, allgatherv);                            \
        (HANM)->enabled = false; /* entire module set to pass-through from now on */     \
    } while(0)
/**
* Global component instance
*/
OMPI_MODULE_DECLSPEC extern mca_coll_han_component_t mca_coll_han_component;
/*
* coll module functions
*/
int mca_coll_han_init_query(bool enable_progress_threads, bool enable_mpi_threads);
mca_coll_base_module_t *mca_coll_han_comm_query(struct ompi_communicator_t *comm, int *priority);
int han_request_free(ompi_request_t ** request);
/* Subcommunicator creation */
int mca_coll_han_comm_create(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module);
int mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, mca_coll_han_module_t *han_module);
/**
* Gather topology information
*
* Returns a pointer to the (potentially already cached) topology.
* NOTE: if the rank distribution is imbalanced, no effort will be made to gather
* the topology at all ranks and instead NULL is returned and han_module->is_mapbycore
* is set to false.
* If HAN ever learns to deal with imbalanced topologies, this needs fixing!
*/
int *mca_coll_han_topo_init(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module,
int num_topo_level);
/* Utils */
/*
 * Translate a world rank (through the vrank table) into its coordinates in
 * the two-level hierarchy: the inter-node (up) rank of its node and its
 * intra-node (low) rank, assuming low_size processes per node.
 */
static inline void
mca_coll_han_get_ranks(int *vranks, int root, int low_size,
                       int *root_low_rank, int *root_up_rank)
{
    const int vrank = vranks[root];
    *root_up_rank  = vrank / low_size;
    *root_low_rank = vrank - (*root_up_rank * low_size);
}
const char* mca_coll_han_topo_lvl_to_str(TOPO_LVL_T topo_lvl);
/** Dynamic component choice */
/*
* Get all the collective modules initialized on this communicator
* This function must be call at the start of every selector implementation
*/
int
mca_coll_han_get_all_coll_modules(struct ompi_communicator_t *comm,
mca_coll_han_module_t *han_module);
int
mca_coll_han_allgather_intra_dynamic(ALLGATHER_BASE_ARGS,
mca_coll_base_module_t *module);
int
mca_coll_han_allgatherv_intra_dynamic(ALLGATHERV_BASE_ARGS,
mca_coll_base_module_t *module);
int
mca_coll_han_allreduce_intra_dynamic(ALLREDUCE_BASE_ARGS,
mca_coll_base_module_t *module);
int
mca_coll_han_bcast_intra_dynamic(BCAST_BASE_ARGS,
mca_coll_base_module_t *module);
int
mca_coll_han_gather_intra_dynamic(GATHER_BASE_ARGS,
mca_coll_base_module_t *module);
int
mca_coll_han_reduce_intra_dynamic(REDUCE_BASE_ARGS,
mca_coll_base_module_t *module);
int
mca_coll_han_scatter_intra_dynamic(SCATTER_BASE_ARGS,
mca_coll_base_module_t *module);
/* Bcast */
int mca_coll_han_bcast_intra_simple(void *buff,
int count,
struct ompi_datatype_t *dtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
int mca_coll_han_bcast_intra(void *buff, int count, struct ompi_datatype_t *dtype, int root,
struct ompi_communicator_t *comm, mca_coll_base_module_t * module);
/* Reduce */
int
mca_coll_han_reduce_intra_simple(const void *sbuf,
void* rbuf,
int count,
struct ompi_datatype_t *dtype,
ompi_op_t *op,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
int
mca_coll_han_reduce_reproducible_decision(struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
int
mca_coll_han_reduce_reproducible(const void *sbuf,
void *rbuf,
int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
int mca_coll_han_reduce_intra(const void *sbuf,
void *rbuf,
int count,
struct ompi_datatype_t *dtype,
ompi_op_t* op,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t * module);
/* Allreduce */
int
mca_coll_han_allreduce_intra_simple(const void *sbuf,
void *rbuf,
int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
int
mca_coll_han_allreduce_reproducible_decision(struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
int
mca_coll_han_allreduce_reproducible(const void *sbuf,
void *rbuf,
int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
int mca_coll_han_allreduce_intra(const void *sbuf,
void *rbuf,
int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm, mca_coll_base_module_t * module);
/* Scatter */
int
mca_coll_han_scatter_intra(const void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root,
struct ompi_communicator_t *comm, mca_coll_base_module_t * module);
/* Gather */
int
mca_coll_han_gather_intra(const void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root,
struct ompi_communicator_t *comm, mca_coll_base_module_t * module);
int
mca_coll_han_gather_intra_simple(const void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
/* reordering after gather, for unordered ranks */
void
ompi_coll_han_reorder_gather(const void *sbuf,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
int * topo);
/* Allgather */
int
mca_coll_han_allgather_intra(const void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm, mca_coll_base_module_t * module);
int
mca_coll_han_allgather_intra_simple(const void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
#endif /* MCA_COLL_HAN_EXPORT_H */

420
ompi/mca/coll/han/coll_han_allgather.c Обычный файл
Просмотреть файл

@ -0,0 +1,420 @@
/*
* Copyright (c) 2018-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "coll_han.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "coll_han_trigger.h"
static int mca_coll_han_allgather_lb_task(void *task_args);
static int mca_coll_han_allgather_lg_task(void *task_args);
static int mca_coll_han_allgather_uag_task(void *task_args);
/*
 * Fill the argument structure shared by the three chained allgather tasks
 * (lg -> uag -> lb). Pure field-by-field copy, no validation.
 */
static inline void
mca_coll_han_set_allgather_args(mca_coll_han_allgather_t * args,
                                mca_coll_task_t * cur_task,
                                void *sbuf,
                                void *sbuf_inter_free,
                                int scount,
                                struct ompi_datatype_t *sdtype,
                                void *rbuf,
                                int rcount,
                                struct ompi_datatype_t *rdtype,
                                int root_low_rank,
                                struct ompi_communicator_t *up_comm,
                                struct ompi_communicator_t *low_comm,
                                int w_rank,
                                bool noop,
                                bool is_mapbycore,
                                int *topo,
                                ompi_request_t * req)
{
    args->cur_task = cur_task;
    args->sbuf = sbuf;
    /* Intermediate buffer to free after the inter-node step (may be NULL) */
    args->sbuf_inter_free = sbuf_inter_free;
    args->scount = scount;
    args->sdtype = sdtype;
    args->rbuf = rbuf;
    args->rcount = rcount;
    args->rdtype = rdtype;
    args->root_low_rank = root_low_rank;
    args->up_comm = up_comm;
    args->low_comm = low_comm;
    args->w_rank = w_rank;
    /* noop == true when this rank is not the node leader and therefore
     * skips the inter-node step (caller passes low_rank != root_low_rank) */
    args->noop = noop;
    args->is_mapbycore = is_mapbycore;
    args->topo = topo;
    args->req = req;
}
/*
 * Task-based hierarchical allgather:
 *   1. lg  - gather onto the node leader over the low (intra-node) communicator
 *   2. uag - allgather among node leaders over the up (inter-node) communicator
 *   3. lb  - broadcast the assembled result inside each node
 * Falls back to the previously selected component when the sub-communicators
 * cannot be built or when nodes host unequal numbers of processes.
 */
int
mca_coll_han_allgather_intra(const void *sbuf, int scount,
                             struct ompi_datatype_t *sdtype,
                             void *rbuf, int rcount,
                             struct ompi_datatype_t *rdtype,
                             struct ompi_communicator_t *comm,
                             mca_coll_base_module_t * module)
{
    /* Create the subcommunicators */
    mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module;
    if( OMPI_SUCCESS != mca_coll_han_comm_create_new(comm, han_module) ) {
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "han cannot handle allgather within this communicator. Fall back on another component\n"));
        /* HAN cannot work with this communicator so fallback on all collectives */
        HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
        return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype,
                                            comm, comm->c_coll->coll_allgather_module);
    }
    ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE];
    ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE];
    int low_rank = ompi_comm_rank(low_comm);
    int w_rank = ompi_comm_rank(comm);
    /* Init topo */
    int *topo = mca_coll_han_topo_init(comm, han_module, 2);
    /* unbalanced case needs algo adaptation */
    if (han_module->are_ppn_imbalanced) {
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "han cannot handle allgather with this communicator (imbalance). Fall back on another component\n"));
        /* Only allgather is dropped here; other collectives stay on HAN */
        HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, allgather);
        return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype,
                                            comm, comm->c_coll->coll_allgather_module);
    }
    ompi_request_t *temp_request = NULL;
    /* Set up request: completed by the final (lb) task, waited on below */
    temp_request = OBJ_NEW(ompi_request_t);
    temp_request->req_state = OMPI_REQUEST_ACTIVE;
    temp_request->req_type = OMPI_REQUEST_COLL;
    temp_request->req_free = han_request_free;
    temp_request->req_status = (ompi_status_public_t){0};
    temp_request->req_complete = REQUEST_PENDING;
    int root_low_rank = 0;
    /* Create lg (lower level gather) task */
    mca_coll_task_t *lg = OBJ_NEW(mca_coll_task_t);
    /* Setup lg task arguments
     * NOTE(review): malloc result is not checked -- confirm OOM policy */
    mca_coll_han_allgather_t *lg_args = malloc(sizeof(mca_coll_han_allgather_t));
    mca_coll_han_set_allgather_args(lg_args, lg, (char *) sbuf, NULL, scount, sdtype, rbuf, rcount,
                                    rdtype, root_low_rank, up_comm, low_comm, w_rank,
                                    low_rank != root_low_rank, han_module->is_mapbycore, topo,
                                    temp_request);
    /* Init and issue lg task; it chains uag and lb itself */
    init_task(lg, mca_coll_han_allgather_lg_task, (void *) (lg_args));
    issue_task(lg);
    /* Block until the lb task completes the request */
    ompi_request_wait(&temp_request, MPI_STATUS_IGNORE);
    return OMPI_SUCCESS;
}
/* lg: lower level (intra-node) gather task.
 * Gathers every local rank's contribution into a temporary buffer on the
 * node leader, then re-purposes the current task as the uag task. */
int mca_coll_han_allgather_lg_task(void *task_args)
{
    mca_coll_han_allgather_t *t = (mca_coll_han_allgather_t *) task_args;
    char *tmp_buf = NULL, *tmp_rbuf = NULL;
    char *tmp_send = NULL;
    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] HAN Allgather: lg\n",
                         t->w_rank));
    /* If the process is one of the node leader */
    ptrdiff_t rlb, rext;
    ompi_datatype_get_extent (t->rdtype, &rlb, &rext);
    /* MPI_IN_PLACE: the send parameters are taken from the receive side */
    if (MPI_IN_PLACE == t->sbuf) {
        t->sdtype = t->rdtype;
        t->scount = t->rcount;
    }
    if (!t->noop) {
        /* Node leader: allocate a span-sized buffer (handles datatypes with
         * gaps) to receive the whole node's data */
        int low_size = ompi_comm_size(t->low_comm);
        ptrdiff_t rsize, rgap = 0;
        rsize = opal_datatype_span(&t->rdtype->super, (int64_t) t->rcount * low_size, &rgap);
        tmp_buf = (char *) malloc(rsize);
        tmp_rbuf = tmp_buf - rgap;
        if (MPI_IN_PLACE == t->sbuf) {
            /* copy the leader's own block out of rbuf before gathering */
            tmp_send = ((char*)t->rbuf) + (ptrdiff_t)t->w_rank * (ptrdiff_t)t->rcount * rext;
            ompi_datatype_copy_content_same_ddt(t->rdtype, t->rcount, tmp_rbuf, tmp_send);
        }
    }
    /* Lower level (shared memory or intra-node) gather */
    if (MPI_IN_PLACE == t->sbuf) {
        if (!t->noop) {
            t->low_comm->c_coll->coll_gather(MPI_IN_PLACE, t->scount, t->sdtype,
                                             tmp_rbuf, t->rcount, t->rdtype, t->root_low_rank,
                                             t->low_comm, t->low_comm->c_coll->coll_gather_module);
        }
        else {
            /* non-leader: send own block taken from its position in rbuf */
            tmp_send = ((char*)t->rbuf) + (ptrdiff_t)t->w_rank * (ptrdiff_t)t->rcount * rext;
            t->low_comm->c_coll->coll_gather(tmp_send, t->rcount, t->rdtype,
                                             NULL, t->rcount, t->rdtype, t->root_low_rank,
                                             t->low_comm, t->low_comm->c_coll->coll_gather_module);
        }
    }
    else {
        t->low_comm->c_coll->coll_gather((char *) t->sbuf, t->scount, t->sdtype, tmp_rbuf, t->rcount,
                                         t->rdtype, t->root_low_rank, t->low_comm,
                                         t->low_comm->c_coll->coll_gather_module);
    }
    /* Hand the gathered data to the next stage; tmp_buf ownership moves to
     * sbuf_inter_free (freed after the inter-node allgather) */
    t->sbuf = tmp_rbuf;
    t->sbuf_inter_free = tmp_buf;
    /* Create uag (upper level all-gather) task */
    mca_coll_task_t *uag = t->cur_task;
    /* Init and issue uag task */
    init_task(uag, mca_coll_han_allgather_uag_task, (void *) t);
    issue_task(uag);
    return OMPI_SUCCESS;
}
/* uag: upper level (inter-node) all-gather task.
 * Node leaders exchange their node-gathered blocks; if ranks are not mapped
 * by core the result arrives out of order and is copied into rbuf using the
 * topology table. Non-leaders (noop) skip straight to the lb task. */
int mca_coll_han_allgather_uag_task(void *task_args)
{
    mca_coll_han_allgather_t *t = (mca_coll_han_allgather_t *) task_args;
    if (t->noop) {
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "[%d] HAN Allgather: uag noop\n", t->w_rank));
    } else {
        int low_size = ompi_comm_size(t->low_comm);
        int up_size = ompi_comm_size(t->up_comm);
        char *reorder_buf = NULL;
        char *reorder_rbuf = NULL;
        if (t->is_mapbycore) {
            /* ranks already in topological order: gather straight into rbuf */
            OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                                 "[%d]: HAN Allgather is bycore: ", t->w_rank));
            reorder_rbuf = (char *) t->rbuf;
        } else {
            /* need a staging buffer, data will be reordered below */
            ptrdiff_t rsize, rgap = 0;
            rsize =
                opal_datatype_span(&t->rdtype->super,
                                   (int64_t) t->rcount * low_size * up_size,
                                   &rgap);
            reorder_buf = (char *) malloc(rsize);
            reorder_rbuf = reorder_buf - rgap;
        }
        /* Inter node allgather: each leader contributes low_size blocks */
        t->up_comm->c_coll->coll_allgather((char *) t->sbuf, t->scount * low_size, t->sdtype,
                                           reorder_rbuf, t->rcount * low_size, t->rdtype,
                                           t->up_comm, t->up_comm->c_coll->coll_allgather_module);
        /* the lg-stage temporary buffer is no longer needed */
        if (t->sbuf_inter_free != NULL) {
            free(t->sbuf_inter_free);
            t->sbuf_inter_free = NULL;
        }
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "[%d] HAN Allgather: ug allgather finish\n", t->w_rank));
        /* Reorder the node leader's rbuf, copy data from tmp_rbuf to rbuf:
         * block (i, j) in arrival order belongs to world rank
         * topo[(i * low_size + j) * 2 + 1] */
        if (!t->is_mapbycore) {
            int i, j;
            ptrdiff_t rextent;
            ompi_datatype_type_extent(t->rdtype, &rextent);
            for (i = 0; i < up_size; i++) {
                for (j = 0; j < low_size; j++) {
                    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                                         "[%d]: HAN Allgather copy from %d %d\n", t->w_rank,
                                         (i * low_size + j) * 2 + 1,
                                         t->topo[(i * low_size + j) * 2 + 1]));
                    ompi_datatype_copy_content_same_ddt(t->rdtype,
                                                        (ptrdiff_t) t->rcount,
                                                        (char *) t->rbuf +
                                                        rextent *
                                                        (ptrdiff_t) t->topo[(i * low_size + j) * 2 +
                                                                            1] *
                                                        (ptrdiff_t) t->rcount,
                                                        reorder_rbuf + rextent * (i * low_size +
                                                                                  j) *
                                                        (ptrdiff_t) t->rcount);
                }
            }
            free(reorder_buf);
            reorder_buf = NULL;
        }
    }
    /* Create lb (low level broadcast) task */
    mca_coll_task_t *lb = t->cur_task;
    /* Init and issue lb task */
    init_task(lb, mca_coll_han_allgather_lb_task, (void *) t);
    issue_task(lb);
    return OMPI_SUCCESS;
}
/* lb: low level (intra-node) broadcast task.
 * Node leaders propagate the fully assembled result to every rank on their
 * node, then the user-visible request is completed and the argument
 * structure released. */
int mca_coll_han_allgather_lb_task(void *task_args)
{
    mca_coll_han_allgather_t *t = (mca_coll_han_allgather_t *) task_args;
    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] HAN Allgather: uag noop\n",
                         t->w_rank));
    OBJ_RELEASE(t->cur_task);
    /* total element count: one block per world rank */
    const int total_count =
        t->rcount * ompi_comm_size(t->low_comm) * ompi_comm_size(t->up_comm);
    t->low_comm->c_coll->coll_bcast((char *) t->rbuf, total_count, t->rdtype,
                                    t->root_low_rank, t->low_comm,
                                    t->low_comm->c_coll->coll_bcast_module);
    /* grab the request before freeing the args that own the pointer */
    ompi_request_t *finished_req = t->req;
    free(t);
    ompi_request_complete(finished_req, 1);
    return OMPI_SUCCESS;
}
/*
 * Blocking (non-task-based) hierarchical allgather:
 *   1. gather onto node leaders over the low (intra-node) communicator
 *   2. allgather among leaders over the up (inter-node) communicator,
 *      reordering afterwards if ranks are not mapped by core
 *   3. broadcast the full result inside each node
 */
int
mca_coll_han_allgather_intra_simple(const void *sbuf, int scount,
                                    struct ompi_datatype_t *sdtype,
                                    void* rbuf, int rcount,
                                    struct ompi_datatype_t *rdtype,
                                    struct ompi_communicator_t *comm,
                                    mca_coll_base_module_t *module){
    /* create the subcommunicators */
    mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;
    if( OMPI_SUCCESS != mca_coll_han_comm_create_new(comm, han_module) ) {
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "han cannot handle allgather within this communicator. Fall back on another component\n"));
        /* HAN cannot work with this communicator so fallback on all collectives */
        HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
        return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype,
                                            comm, comm->c_coll->coll_allgather_module);
    }
    /* discover topology */
    int *topo = mca_coll_han_topo_init(comm, han_module, 2);
    /* unbalanced case needs algo adaptation */
    if (han_module->are_ppn_imbalanced) {
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "han cannot handle allgather within this communicator (imbalance). Fall back on another component\n"));
        /* Put back the fallback collective support and call it once. All
         * future calls will then be automatically redirected.
         */
        HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, allgather);
        return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype,
                                            comm, comm->c_coll->coll_allgather_module);
    }
    ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE];
    ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE];
    int w_rank = ompi_comm_rank(comm);
    /* setup up/low coordinates */
    int low_rank = ompi_comm_rank(low_comm);
    int low_size = ompi_comm_size(low_comm);
    int up_rank = ompi_comm_rank(up_comm);
    int up_size = ompi_comm_size(up_comm);
    int root_low_rank = 0; // node leader will be 0 on each rank
    /* allocate the intermediary buffer
     * to gather on leaders on the low sub communicator */
    ptrdiff_t rlb, rext;
    ompi_datatype_get_extent (rdtype, &rlb, &rext);
    char *tmp_buf = NULL;
    char *tmp_buf_start = NULL;
    char *tmp_send = NULL;
    /* MPI_IN_PLACE: take send parameters from the receive side */
    if (MPI_IN_PLACE == sbuf) {
        scount = rcount;
        sdtype = rdtype;
    }
    if (low_rank == root_low_rank) {
        ptrdiff_t rsize, rgap = 0;
        /* Compute the size to receive all the local data, including datatypes empty gaps */
        rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * low_size, &rgap);
        /* intermediary buffer on node leaders to gather on low comm
         * NOTE(review): malloc result is not checked -- confirm OOM policy */
        tmp_buf = (char *) malloc(rsize);
        tmp_buf_start = tmp_buf - rgap;
        if (MPI_IN_PLACE == sbuf) {
            /* copy the leader's own block out of rbuf before gathering */
            tmp_send = ((char*)rbuf) + (ptrdiff_t)w_rank * (ptrdiff_t)rcount * rext;
            ompi_datatype_copy_content_same_ddt(rdtype, rcount, tmp_buf_start, tmp_send);
        }
    }
    /* 1. low gather on node leaders into tmp_buf */
    if (MPI_IN_PLACE == sbuf) {
        if (low_rank == root_low_rank) {
            low_comm->c_coll->coll_gather(MPI_IN_PLACE, scount, sdtype,
                                          tmp_buf_start, rcount, rdtype, root_low_rank,
                                          low_comm, low_comm->c_coll->coll_gather_module);
        }
        else {
            /* non-leader: send own block taken from its position in rbuf */
            tmp_send = ((char*)rbuf) + (ptrdiff_t)w_rank * (ptrdiff_t)rcount * rext;
            low_comm->c_coll->coll_gather(tmp_send, rcount, rdtype,
                                          NULL, rcount, rdtype, root_low_rank,
                                          low_comm, low_comm->c_coll->coll_gather_module);
        }
    }
    else {
        low_comm->c_coll->coll_gather((char *)sbuf, scount, sdtype,
                                      tmp_buf_start, rcount, rdtype, root_low_rank,
                                      low_comm, low_comm->c_coll->coll_gather_module);
    }
    /* 2. allgather between node leaders, from tmp_buf to reorder_buf */
    if (low_rank == root_low_rank) {
        /* allocate buffer to store unordered result on node leaders
         * if the processes are mapped-by core, no need to reorder:
         * distribution of ranks on core first and node next,
         * in a increasing order for both patterns.
         */
        char *reorder_buf = NULL;
        char *reorder_buf_start = NULL;
        if (han_module->is_mapbycore) {
            reorder_buf_start = rbuf;
        } else {
            if (0 == low_rank && 0 == up_rank) { // first rank displays message
                OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                                     "[%d]: Future Allgather needs reordering: ", up_rank));
            }
            ptrdiff_t rsize, rgap = 0;
            rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * low_size * up_size, &rgap);
            reorder_buf = (char *) malloc(rsize);
            reorder_buf_start = reorder_buf - rgap;
        }
        /* 2a. inter node allgather: each leader contributes low_size blocks */
        up_comm->c_coll->coll_allgather(tmp_buf_start, scount*low_size, sdtype,
                                        reorder_buf_start, rcount*low_size, rdtype,
                                        up_comm, up_comm->c_coll->coll_allgather_module);
        if (tmp_buf != NULL) {
            free(tmp_buf);
            tmp_buf = NULL;
            tmp_buf_start = NULL;
        }
        /* 2b. reorder the node leader's into rbuf.
         * if ranks are not mapped in topological order, data needs to be reordered
         * (see reorder_gather)
         */
        if (!han_module->is_mapbycore) {
            ompi_coll_han_reorder_gather(reorder_buf_start,
                                         rbuf, rcount, rdtype,
                                         comm, topo);
            free(reorder_buf);
            reorder_buf = NULL;
        }
    }
    /* 3. up broadcast: leaders broadcast on their nodes */
    low_comm->c_coll->coll_bcast(rbuf, rcount*low_size*up_size, rdtype,
                                 root_low_rank, low_comm,
                                 low_comm->c_coll->coll_bcast_module);
    return OMPI_SUCCESS;
}

558
ompi/mca/coll/han/coll_han_allreduce.c Обычный файл
Просмотреть файл

@ -0,0 +1,558 @@
/*
* Copyright (c) 2018-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "coll_han.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "coll_han_trigger.h"
static int mca_coll_han_allreduce_t0_task(void *task_args);
static int mca_coll_han_allreduce_t1_task(void *task_args);
static int mca_coll_han_allreduce_t2_task(void *task_args);
static int mca_coll_han_allreduce_t3_task(void *task_args);
/* Only work with regular situation (each node has equal number of processes) */
/*
 * Fill the argument structure shared by the pipelined allreduce tasks
 * (t0 -> t1 -> t2 -> t3...). Pure field-by-field copy, no validation.
 */
static inline void
mca_coll_han_set_allreduce_args(mca_coll_han_allreduce_args_t * args,
                                mca_coll_task_t * cur_task,
                                void *sbuf,
                                void *rbuf,
                                int seg_count,
                                struct ompi_datatype_t *dtype,
                                struct ompi_op_t *op,
                                int root_up_rank,
                                int root_low_rank,
                                struct ompi_communicator_t *up_comm,
                                struct ompi_communicator_t *low_comm,
                                int num_segments,
                                int cur_seg,
                                int w_rank,
                                int last_seg_count,
                                bool noop, ompi_request_t * req, int *completed)
{
    args->cur_task = cur_task;
    args->sbuf = sbuf;
    args->rbuf = rbuf;
    args->seg_count = seg_count;
    args->dtype = dtype;
    args->op = op;
    args->root_up_rank = root_up_rank;
    args->root_low_rank = root_low_rank;
    args->up_comm = up_comm;
    args->low_comm = low_comm;
    args->num_segments = num_segments;
    args->cur_seg = cur_seg;
    args->w_rank = w_rank;
    /* element count of the (possibly shorter) final segment */
    args->last_seg_count = last_seg_count;
    /* noop == true when this rank is not the node leader and therefore
     * skips the inter-node steps (caller passes low_rank != root_low_rank) */
    args->noop = noop;
    args->req = req;
    /* shared counter of fully processed segments (drives the caller's loop) */
    args->completed = completed;
}
/*
 * Each segment of the message needs to go through 4 steps to perform MPI_Allreduce:
* lr: lower level (shared-memory or intra-node) reduce,
* ur: upper level (inter-node) reduce,
* ub: upper level (inter-node) bcast,
* lb: lower level (shared-memory or intra-node) bcast.
* Hence, in each iteration, there is a combination of collective operations which is called a task.
* | seg 0 | seg 1 | seg 2 | seg 3 |
* iter 0 | lr | | | | task: t0, contains lr
* iter 1 | ur | lr | | | task: t1, contains ur and lr
* iter 2 | ub | ur | lr | | task: t2, contains ub, ur and lr
* iter 3 | lb | ub | ur | lr | task: t3, contains lb, ub, ur and lr
* iter 4 | | lb | ub | ur | task: t3, contains lb, ub and ur
* iter 5 | | | lb | ub | task: t3, contains lb and ub
* iter 6 | | | | lb | task: t3, contains lb
*/
/*
 * Pipelined hierarchical allreduce (see the segment/iteration diagram above):
 * each segment goes through lr -> ur -> ub -> lb, with consecutive segments
 * overlapped by the t0..t3 tasks. Requires a commutative operation; falls
 * back to the previous component otherwise, and drops HAN entirely when the
 * sub-communicators cannot be built.
 */
int
mca_coll_han_allreduce_intra(const void *sbuf,
                             void *rbuf,
                             int count,
                             struct ompi_datatype_t *dtype,
                             struct ompi_op_t *op,
                             struct ompi_communicator_t *comm, mca_coll_base_module_t * module)
{
    mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;

    /* No support for non-commutative operations */
    if(!ompi_op_is_commute(op)) {
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "han cannot handle allreduce with this operation. Fall back on another component\n"));
        goto prev_allreduce_intra;
    }

    /* Create the subcommunicators */
    if( OMPI_SUCCESS != mca_coll_han_comm_create(comm, han_module) ) {
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "han cannot handle allreduce with this communicator. Drop HAN support in this communicator and fall back on another component\n"));
        /* HAN cannot work with this communicator so fallback on all collectives */
        HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
        /* BUGFIX: the fallback allreduce must be paired with the allreduce
         * module, not the reduce module */
        return comm->c_coll->coll_allreduce(sbuf, rbuf, count, dtype, op,
                                            comm, comm->c_coll->coll_allreduce_module);
    }

    ptrdiff_t extent, lb;
    size_t dtype_size;
    ompi_datatype_get_extent(dtype, &lb, &extent);
    int seg_count = count, w_rank;
    w_rank = ompi_comm_rank(comm);
    ompi_datatype_type_size(dtype, &dtype_size);

    ompi_communicator_t *low_comm;
    ompi_communicator_t *up_comm;
    /* use MCA parameters for now */
    low_comm = han_module->cached_low_comms[mca_coll_han_component.han_allreduce_low_module];
    up_comm = han_module->cached_up_comms[mca_coll_han_component.han_allreduce_up_module];
    COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_allreduce_segsize, dtype_size,
                                seg_count);

    /* Determine number of elements sent per task. */
    OPAL_OUTPUT_VERBOSE((10, mca_coll_han_component.han_output,
                         "In HAN Allreduce seg_size %d seg_count %d count %d\n",
                         mca_coll_han_component.han_allreduce_segsize, seg_count, count));
    int num_segments = (count + seg_count - 1) / seg_count;
    int low_rank = ompi_comm_rank(low_comm);
    int root_up_rank = 0;
    int root_low_rank = 0;

    /* Create t0 task for the first segment */
    mca_coll_task_t *t0 = OBJ_NEW(mca_coll_task_t);
    /* Setup up t0 task arguments (with OOM checks on the small allocations) */
    int *completed = (int *) malloc(sizeof(int));
    if (NULL == completed) {
        OBJ_RELEASE(t0);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    completed[0] = 0;
    mca_coll_han_allreduce_args_t *t = malloc(sizeof(mca_coll_han_allreduce_args_t));
    if (NULL == t) {
        free(completed);
        OBJ_RELEASE(t0);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    /* NOTE(review): with MPI_IN_PLACE and more than one segment, t1/t2/t3
     * offset t->sbuf arithmetically -- confirm multi-segment IN_PLACE is
     * exercised/supported */
    mca_coll_han_set_allreduce_args(t, t0, (char *) sbuf, (char *) rbuf, seg_count, dtype, op,
                                    root_up_rank, root_low_rank, up_comm, low_comm, num_segments, 0,
                                    w_rank, count - (num_segments - 1) * seg_count,
                                    low_rank != root_low_rank, NULL, completed);
    /* Init t0 task */
    init_task(t0, mca_coll_han_allreduce_t0_task, (void *) (t));
    /* Issue t0 task */
    issue_task(t0);

    /* Create t1 tasks for the current segment */
    mca_coll_task_t *t1 = OBJ_NEW(mca_coll_task_t);
    /* Setup up t1 task arguments */
    t->cur_task = t1;
    /* Init t1 task */
    init_task(t1, mca_coll_han_allreduce_t1_task, (void *) t);
    /* Issue t1 task */
    issue_task(t1);

    /* Create t2 tasks for the current segment */
    mca_coll_task_t *t2 = OBJ_NEW(mca_coll_task_t);
    /* Setup up t2 task arguments */
    t->cur_task = t2;
    /* Init t2 task */
    init_task(t2, mca_coll_han_allreduce_t2_task, (void *) t);
    issue_task(t2);

    /* Create t3 tasks for the current segment */
    mca_coll_task_t *t3 = OBJ_NEW(mca_coll_task_t);
    /* Setup up t3 task arguments */
    t->cur_task = t3;
    /* Init t3 task */
    init_task(t3, mca_coll_han_allreduce_t3_task, (void *) t);
    issue_task(t3);

    /* Steady state of the pipeline: one t3 task per remaining segment,
     * advancing the buffer pointers by one segment each iteration */
    while (t->completed[0] != t->num_segments) {
        /* Create t3 tasks for the current segment */
        mca_coll_task_t *t3 = OBJ_NEW(mca_coll_task_t);
        /* Setup up t3 task arguments */
        t->cur_task = t3;
        t->sbuf = (char *) t->sbuf + extent * t->seg_count;
        t->rbuf = (char *) t->rbuf + extent * t->seg_count;
        t->cur_seg = t->cur_seg + 1;
        /* Init t3 task */
        init_task(t3, mca_coll_han_allreduce_t3_task, (void *) t);
        issue_task(t3);
    }
    free(t->completed);
    t->completed = NULL;
    free(t);

    return OMPI_SUCCESS;

 prev_allreduce_intra:
    return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op,
                                          comm, han_module->previous_allreduce_module);
}
/* t0 task: lower level (intra-node) reduce of the first segment onto the
 * node leader. */
int mca_coll_han_allreduce_t0_task(void *task_args)
{
    mca_coll_han_allreduce_args_t *t = (mca_coll_han_allreduce_args_t *) task_args;
    /* NOTE(review): the debug print reads rbuf as int -- only meaningful for
     * integer payloads */
    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                         "[%d] HAN Allreduce: t0 %d r_buf %d\n", t->w_rank, t->cur_seg,
                         ((int *) t->rbuf)[0]));
    OBJ_RELEASE(t->cur_task);
    ptrdiff_t extent, lb;
    ompi_datatype_get_extent(t->dtype, &lb, &extent);
    if (MPI_IN_PLACE == t->sbuf) {
        /* IN_PLACE: data to reduce already sits in rbuf */
        if (!t->noop) {
            /* node leader reduces in place */
            t->low_comm->c_coll->coll_reduce(MPI_IN_PLACE, (char *) t->rbuf, t->seg_count, t->dtype,
                                             t->op, t->root_low_rank, t->low_comm,
                                             t->low_comm->c_coll->coll_reduce_module);
        }
        else {
            /* non-root contributes rbuf as its send buffer */
            t->low_comm->c_coll->coll_reduce((char *) t->rbuf, NULL, t->seg_count, t->dtype,
                                             t->op, t->root_low_rank, t->low_comm,
                                             t->low_comm->c_coll->coll_reduce_module);
        }
    }
    else {
        t->low_comm->c_coll->coll_reduce((char *) t->sbuf, (char *) t->rbuf, t->seg_count, t->dtype,
                                         t->op, t->root_low_rank, t->low_comm,
                                         t->low_comm->c_coll->coll_reduce_module);
    }
    return OMPI_SUCCESS;
}
/* t1 task: overlaps the upper level (inter-node) reduce of the current
 * segment with the lower level reduce of the next segment. */
int mca_coll_han_allreduce_t1_task(void *task_args)
{
    mca_coll_han_allreduce_args_t *t = (mca_coll_han_allreduce_args_t *) task_args;
    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                         "[%d] HAN Allreduce: t1 %d r_buf %d\n", t->w_rank, t->cur_seg,
                         ((int *) t->rbuf)[0]));
    OBJ_RELEASE(t->cur_task);
    ptrdiff_t extent, lb;
    ompi_datatype_get_extent(t->dtype, &lb, &extent);
    /* only valid (and waited on) when !t->noop */
    ompi_request_t *ireduce_req;
    int tmp_count = t->seg_count;
    if (!t->noop) {
        int up_rank = ompi_comm_rank(t->up_comm);
        /* ur of cur_seg (non-blocking, progressed while lr runs below) */
        if (up_rank == t->root_up_rank) {
            t->up_comm->c_coll->coll_ireduce(MPI_IN_PLACE, (char *) t->rbuf, t->seg_count, t->dtype,
                                             t->op, t->root_up_rank, t->up_comm, &ireduce_req,
                                             t->up_comm->c_coll->coll_ireduce_module);
        } else {
            t->up_comm->c_coll->coll_ireduce((char *) t->rbuf, (char *) t->rbuf, t->seg_count,
                                             t->dtype, t->op, t->root_up_rank, t->up_comm,
                                             &ireduce_req, t->up_comm->c_coll->coll_ireduce_module);
        }
    }
    /* lr of cur_seg+1 (the last segment may be shorter) */
    if (t->cur_seg <= t->num_segments - 2) {
        if (t->cur_seg == t->num_segments - 2 && t->last_seg_count != t->seg_count) {
            tmp_count = t->last_seg_count;
        }
        t->low_comm->c_coll->coll_reduce((char *) t->sbuf + extent * t->seg_count,
                                         (char *) t->rbuf + extent * t->seg_count, tmp_count,
                                         t->dtype, t->op, t->root_low_rank, t->low_comm,
                                         t->low_comm->c_coll->coll_reduce_module);
    }
    if (!t->noop) {
        ompi_request_wait(&ireduce_req, MPI_STATUS_IGNORE);
    }
    return OMPI_SUCCESS;
}
/* t2 task: overlaps three stages -- upper bcast of the current segment,
 * upper reduce of segment+1 and lower reduce of segment+2. */
int mca_coll_han_allreduce_t2_task(void *task_args)
{
    mca_coll_han_allreduce_args_t *t = (mca_coll_han_allreduce_args_t *) task_args;
    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                         "[%d] HAN Allreduce: t2 %d r_buf %d\n", t->w_rank, t->cur_seg,
                         ((int *) t->rbuf)[0]));
    OBJ_RELEASE(t->cur_task);
    ptrdiff_t extent, lb;
    ompi_datatype_get_extent(t->dtype, &lb, &extent);
    /* reqs[0] = ub of cur_seg, reqs[1] = ur of cur_seg+1 (leaders only) */
    ompi_request_t *reqs[2];
    int req_count = 0;
    int tmp_count = t->seg_count;
    if (!t->noop) {
        int up_rank = ompi_comm_rank(t->up_comm);
        /* ub of cur_seg */
        t->up_comm->c_coll->coll_ibcast((char *) t->rbuf, t->seg_count, t->dtype, t->root_up_rank,
                                        t->up_comm, &(reqs[0]),
                                        t->up_comm->c_coll->coll_ibcast_module);
        req_count++;
        /* ur of cur_seg+1 (the last segment may be shorter) */
        if (t->cur_seg <= t->num_segments - 2) {
            if (t->cur_seg == t->num_segments - 2 && t->last_seg_count != t->seg_count) {
                tmp_count = t->last_seg_count;
            }
            if (up_rank == t->root_up_rank) {
                t->up_comm->c_coll->coll_ireduce(MPI_IN_PLACE,
                                                 (char *) t->rbuf + extent * t->seg_count,
                                                 tmp_count, t->dtype, t->op, t->root_up_rank,
                                                 t->up_comm, &(reqs[1]),
                                                 t->up_comm->c_coll->coll_ireduce_module);
            } else {
                t->up_comm->c_coll->coll_ireduce((char *) t->rbuf + extent * t->seg_count,
                                                 (char *) t->rbuf + extent * t->seg_count,
                                                 tmp_count, t->dtype, t->op, t->root_up_rank,
                                                 t->up_comm, &(reqs[1]),
                                                 t->up_comm->c_coll->coll_ireduce_module);
            }
            req_count++;
        }
    }
    /* lr of cur_seg+2 (the last segment may be shorter) */
    if (t->cur_seg <= t->num_segments - 3) {
        if (t->cur_seg == t->num_segments - 3 && t->last_seg_count != t->seg_count) {
            tmp_count = t->last_seg_count;
        }
        t->low_comm->c_coll->coll_reduce((char *) t->sbuf + 2 * extent * t->seg_count,
                                         (char *) t->rbuf + 2 * extent * t->seg_count, tmp_count,
                                         t->dtype, t->op, t->root_low_rank, t->low_comm,
                                         t->low_comm->c_coll->coll_reduce_module);
    }
    if (!t->noop && req_count > 0) {
        ompi_request_wait_all(req_count, reqs, MPI_STATUSES_IGNORE);
    }
    return OMPI_SUCCESS;
}
/* t3 task: steady-state pipeline stage of the segmented hierarchical allreduce.
 * Overlaps, relative to the current segment:
 *  - a non-blocking upper-level ibcast of segment cur_seg+1 (ub),
 *  - a non-blocking upper-level ireduce of segment cur_seg+2 (ur),
 *  - a blocking low-level reduce of segment cur_seg+3 (lr),
 *  - a blocking low-level bcast of the current segment (lb),
 * then waits for the non-blocking requests and bumps the completion counter.
 */
int mca_coll_han_allreduce_t3_task(void *task_args)
{
    mca_coll_han_allreduce_args_t *t = (mca_coll_han_allreduce_args_t *) task_args;
    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                         "[%d] HAN Allreduce: t3 %d r_buf %d\n", t->w_rank, t->cur_seg,
                         ((int *) t->rbuf)[0]));
    OBJ_RELEASE(t->cur_task);
    ptrdiff_t extent, lb;
    ompi_datatype_get_extent(t->dtype, &lb, &extent);
    /* reqs[0] = ibcast of cur_seg+1, reqs[1] = ireduce of cur_seg+2 (when posted) */
    ompi_request_t *reqs[2];
    int req_count = 0;
    int tmp_count = t->seg_count;
    if (!t->noop) {
        int up_rank = ompi_comm_rank(t->up_comm);
        /* ub of cur_seg+1 */
        if (t->cur_seg <= t->num_segments - 2) {
            if (t->cur_seg == t->num_segments - 2 && t->last_seg_count != t->seg_count) {
                tmp_count = t->last_seg_count;
            }
            /* NOTE(review): tmp_count is computed just above but the ibcast is
             * posted with t->seg_count; for a shorter last segment this looks
             * inconsistent with the matching ireduce (which used the adjusted
             * count) — TODO confirm intent. */
            t->up_comm->c_coll->coll_ibcast((char *) t->rbuf + extent * t->seg_count, t->seg_count,
                                            t->dtype, t->root_up_rank, t->up_comm, &(reqs[0]),
                                            t->up_comm->c_coll->coll_ibcast_module);
            req_count++;
        }
        /* ur of cur_seg+2 */
        if (t->cur_seg <= t->num_segments - 3) {
            if (t->cur_seg == t->num_segments - 3 && t->last_seg_count != t->seg_count) {
                tmp_count = t->last_seg_count;
            }
            if (up_rank == t->root_up_rank) {
                t->up_comm->c_coll->coll_ireduce(MPI_IN_PLACE,
                                                 (char *) t->rbuf + 2 * extent * t->seg_count,
                                                 tmp_count, t->dtype, t->op, t->root_up_rank,
                                                 t->up_comm, &(reqs[1]),
                                                 t->up_comm->c_coll->coll_ireduce_module);
            } else {
                t->up_comm->c_coll->coll_ireduce((char *) t->rbuf + 2 * extent * t->seg_count,
                                                 (char *) t->rbuf + 2 * extent * t->seg_count,
                                                 tmp_count, t->dtype, t->op, t->root_up_rank,
                                                 t->up_comm, &(reqs[1]),
                                                 t->up_comm->c_coll->coll_ireduce_module);
            }
            req_count++;
        }
    }
    /* lr of cur_seg+3 */
    if (t->cur_seg <= t->num_segments - 4) {
        if (t->cur_seg == t->num_segments - 4 && t->last_seg_count != t->seg_count) {
            tmp_count = t->last_seg_count;
        }
        t->low_comm->c_coll->coll_reduce((char *) t->sbuf + 3 * extent * t->seg_count,
                                         (char *) t->rbuf + 3 * extent * t->seg_count, tmp_count,
                                         t->dtype, t->op, t->root_low_rank, t->low_comm,
                                         t->low_comm->c_coll->coll_reduce_module);
    }
    /* lb of cur_seg: distribute the fully reduced current segment intra-node.
     * NOTE(review): uses t->seg_count even when cur_seg is the (possibly
     * shorter) last segment — TODO confirm against last_seg_count handling. */
    t->low_comm->c_coll->coll_bcast((char *) t->rbuf, t->seg_count, t->dtype, t->root_low_rank,
                                    t->low_comm, t->low_comm->c_coll->coll_bcast_module);
    if (!t->noop && req_count > 0) {
        ompi_request_wait_all(req_count, reqs, MPI_STATUSES_IGNORE);
    }
    /* Signal overall progress to the driver (completed is shared state) */
    t->completed[0]++;
    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                         "[%d] HAN Allreduce: t3 %d total %d\n", t->w_rank, t->cur_seg,
                         t->completed[0]));
    return OMPI_SUCCESS;
}
/*
 * Simple (non-pipelined) hierarchical allreduce:
 *   1) intra-node reduce to the node leader (root_low_rank),
 *   2) inter-node allreduce among the node leaders,
 *   3) intra-node bcast of the result.
 * Falls back on the previously selected module when the operation is not
 * commutative (the hierarchical split reorders the reduction) or when an
 * intra-node step fails. An inter-node failure is returned directly: only
 * the node leaders execute that step, so a fallback there could hang.
 */
int
mca_coll_han_allreduce_intra_simple(const void *sbuf,
                                    void *rbuf,
                                    int count,
                                    struct ompi_datatype_t *dtype,
                                    struct ompi_op_t *op,
                                    struct ompi_communicator_t *comm,
                                    mca_coll_base_module_t *module)
{
    ompi_communicator_t *low_comm;
    ompi_communicator_t *up_comm;
    int root_low_rank = 0;   /* node leader within low_comm */
    int low_rank;
    int ret;
    mca_coll_han_component_t *cs = &mca_coll_han_component;
    mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;

    OPAL_OUTPUT_VERBOSE((10, cs->han_output,
                         "[OMPI][han] in mca_coll_han_allreduce_intra_simple\n"));
    /* Fallback to another component if the op cannot commute: the two-level
     * reduction changes the order in which operands are combined */
    if (! ompi_op_is_commute(op)) {
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "han cannot handle allreduce with this operation. Fall back on another component\n"));
        goto prev_allreduce;
    }

    /* Create the subcommunicators */
    if( OMPI_SUCCESS != mca_coll_han_comm_create_new(comm, han_module) ) {
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "han cannot handle allreduce with this communicator. Drop HAN support in this communicator and fall back on another component\n"));
        /* HAN cannot work with this communicator so fallback on all collectives */
        HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
        /* Bug fix: pass the allreduce module, not the reduce module, to the
         * fallback allreduce */
        return comm->c_coll->coll_allreduce(sbuf, rbuf, count, dtype, op,
                                            comm, comm->c_coll->coll_allreduce_module);
    }
    low_comm = han_module->sub_comm[INTRA_NODE];
    up_comm = han_module->sub_comm[INTER_NODE];
    low_rank = ompi_comm_rank(low_comm);

    /* Low_comm reduce: everyone contributes to the node leader */
    if (MPI_IN_PLACE == sbuf) {
        if (low_rank == root_low_rank) {
            ret = low_comm->c_coll->coll_reduce(MPI_IN_PLACE, (char *)rbuf,
                        count, dtype, op, root_low_rank,
                        low_comm, low_comm->c_coll->coll_reduce_module);
        }
        else {
            /* MPI_IN_PLACE: non-root data lives in rbuf */
            ret = low_comm->c_coll->coll_reduce((char *)rbuf, NULL,
                        count, dtype, op, root_low_rank,
                        low_comm, low_comm->c_coll->coll_reduce_module);
        }
    }
    else {
        ret = low_comm->c_coll->coll_reduce((char *)sbuf, (char *)rbuf,
                    count, dtype, op, root_low_rank,
                    low_comm, low_comm->c_coll->coll_reduce_module);
    }
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        OPAL_OUTPUT_VERBOSE((30, cs->han_output,
                             "HAN/ALLREDUCE: low comm reduce failed. "
                             "Falling back to another component\n"));
        goto prev_allreduce;
    }

    /* Local roots perform a allreduce on the upper comm */
    if (low_rank == root_low_rank) {
        ret = up_comm->c_coll->coll_allreduce(MPI_IN_PLACE, rbuf, count, dtype, op,
                                              up_comm, up_comm->c_coll->coll_allreduce_module);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            OPAL_OUTPUT_VERBOSE((30, cs->han_output,
                                 "HAN/ALLREDUCE: up comm allreduce failed. \n"));
            /*
             * Do not fallback in such a case: only root_low_ranks follow this
             * path, the other ranks are in another collective.
             * ==> Falling back would potentially lead to a hang.
             * Simply return the error
             */
            return ret;
        }
    }

    /* Low_comm bcast: distribute the global result within each node */
    ret = low_comm->c_coll->coll_bcast(rbuf, count, dtype,
                root_low_rank, low_comm, low_comm->c_coll->coll_bcast_module);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        OPAL_OUTPUT_VERBOSE((30, cs->han_output,
                             "HAN/ALLREDUCE: low comm bcast failed. "
                             "Falling back to another component\n"));
        goto prev_allreduce;
    }
    return OMPI_SUCCESS;

 prev_allreduce:
    return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op,
                                          comm, han_module->previous_allreduce_module);
}
/* Select a reproducible fallback for allreduce: prefer tuned, then basic,
 * and if neither provides allreduce, reuse the previously selected module. */
int
mca_coll_han_allreduce_reproducible_decision(struct ompi_communicator_t *comm,
                                             mca_coll_base_module_t *module)
{
    mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;
    int w_rank = ompi_comm_rank(comm);

    /* Make sure all per-component module handlers are cached */
    mca_coll_han_get_all_coll_modules(comm, han_module);

    /* Candidate components, in decreasing order of preference */
    const int candidates[] = {TUNED, BASIC};
    const int n_candidates = (int)(sizeof(candidates) / sizeof(candidates[0]));
    for (int idx = 0; idx < n_candidates; idx++) {
        const int candidate = candidates[idx];
        mca_coll_base_module_t *candidate_module
            = han_module->modules_storage.modules[candidate].module_handler;
        if (NULL == candidate_module || NULL == candidate_module->coll_allreduce) {
            continue;
        }
        if (0 == w_rank) {
            opal_output_verbose(30, mca_coll_han_component.han_output,
                                "coll:han:allreduce_reproducible: "
                                "fallback on %s\n",
                                available_components[candidate].component_name);
        }
        han_module->reproducible_allreduce_module = candidate_module;
        han_module->reproducible_allreduce = candidate_module->coll_allreduce;
        return OMPI_SUCCESS;
    }

    /* Fallback of the fallback: no reproducible module is available */
    if (0 == w_rank) {
        opal_output_verbose(5, mca_coll_han_component.han_output,
                            "coll:han:allreduce_reproducible_decision: "
                            "no reproducible fallback\n");
    }
    han_module->reproducible_allreduce_module = han_module->previous_allreduce_module;
    han_module->reproducible_allreduce = han_module->previous_allreduce;
    return OMPI_SUCCESS;
}
/* Reproducible allreduce entry point: delegate to the module chosen by
 * mca_coll_han_allreduce_reproducible_decision(). */
int
mca_coll_han_allreduce_reproducible(const void *sbuf,
                                    void *rbuf,
                                    int count,
                                    struct ompi_datatype_t *dtype,
                                    struct ompi_op_t *op,
                                    struct ompi_communicator_t *comm,
                                    mca_coll_base_module_t *module)
{
    mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;
    mca_coll_base_module_t *sub_module = han_module->reproducible_allreduce_module;
    return han_module->reproducible_allreduce(sbuf, rbuf, count, dtype, op,
                                              comm, sub_module);
}

281
ompi/mca/coll/han/coll_han_bcast.c Обычный файл
Просмотреть файл

@ -0,0 +1,281 @@
/*
* Copyright (c) 2018-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "coll_han.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "coll_han_trigger.h"
static int mca_coll_han_bcast_t0_task(void *task_args);
static int mca_coll_han_bcast_t1_task(void *task_args);
/* Populate the argument structure shared by the bcast pipeline tasks. */
static inline void
mca_coll_han_set_bcast_args(mca_coll_han_bcast_args_t * args, mca_coll_task_t * cur_task, void *buff,
                            int seg_count, struct ompi_datatype_t *dtype,
                            int root_up_rank, int root_low_rank,
                            struct ompi_communicator_t *up_comm,
                            struct ompi_communicator_t *low_comm,
                            int num_segments, int cur_seg, int w_rank, int last_seg_count,
                            bool noop)
{
    /* Task bookkeeping */
    args->cur_task = cur_task;
    args->noop = noop;
    /* Message layout */
    args->buff = buff;
    args->dtype = dtype;
    args->seg_count = seg_count;
    args->last_seg_count = last_seg_count;
    args->num_segments = num_segments;
    args->cur_seg = cur_seg;
    /* Topology */
    args->up_comm = up_comm;
    args->low_comm = low_comm;
    args->root_up_rank = root_up_rank;
    args->root_low_rank = root_low_rank;
    args->w_rank = w_rank;
}
/*
 * Each segment of the message needs to go through 2 steps to perform MPI_Bcast:
* ub: upper level (inter-node) bcast
* lb: low level (shared-memory or intra-node) bcast.
* Hence, in each iteration, there is a combination of collective operations which is called a task.
* | seg 0 | seg 1 | seg 2 | seg 3 |
* iter 0 | ub | | | | task: t0, contains ub
* iter 1 | lb | ub | | | task: t1, contains ub and lb
* iter 2 | | lb | ub | | task: t1, contains ub and lb
* iter 3 | | | lb | ub | task: t1, contains ub and lb
* iter 4 | | | | lb | task: t1, contains lb
*/
/* Segmented hierarchical bcast driver: splits the message into segments of
 * han_bcast_segsize bytes and pipelines the inter-node bcast of segment k+1
 * with the intra-node bcast of segment k (see the stage diagram above).
 * Falls back on the previously selected component when the communicator
 * cannot be split hierarchically or when ranks-per-node is imbalanced. */
int
mca_coll_han_bcast_intra(void *buff,
                         int count,
                         struct ompi_datatype_t *dtype,
                         int root,
                         struct ompi_communicator_t *comm, mca_coll_base_module_t * module)
{
    mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;
    int err, seg_count = count, w_rank = ompi_comm_rank(comm);
    ompi_communicator_t *low_comm, *up_comm;
    ptrdiff_t extent, lb;
    size_t dtype_size;

    /* Create the subcommunicators */
    err = mca_coll_han_comm_create(comm, han_module);
    if( OMPI_SUCCESS != err ) { /* Let's hope the error is consistently returned across the entire communicator */
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "han cannot handle bcast with this communicator. Fall back on another component\n"));
        /* Put back the fallback collective support and call it once. All
         * future calls will then be automatically redirected.
         */
        HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
        return comm->c_coll->coll_bcast(buff, count, dtype, root,
                                        comm, comm->c_coll->coll_bcast_module);
    }
    /* Topo must be initialized to know rank distribution which then is used to
     * determine if han can be used */
    mca_coll_han_topo_init(comm, han_module, 2);
    if (han_module->are_ppn_imbalanced) {
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "han cannot handle bcast with this communicator (imbalance). Fall back on another component\n"));
        /* Put back the fallback collective support and call it once. All
         * future calls will then be automatically redirected.
         */
        HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, bcast);
        return comm->c_coll->coll_bcast(buff, count, dtype, root,
                                        comm, comm->c_coll->coll_bcast_module);
    }

    ompi_datatype_get_extent(dtype, &lb, &extent);
    ompi_datatype_type_size(dtype, &dtype_size);
    /* use MCA parameters for now */
    low_comm = han_module->cached_low_comms[mca_coll_han_component.han_bcast_low_module];
    up_comm = han_module->cached_up_comms[mca_coll_han_component.han_bcast_up_module];
    /* Derive the per-segment element count from the configured segment size */
    COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_bcast_segsize, dtype_size,
                                seg_count);

    int num_segments = (count + seg_count - 1) / seg_count;
    OPAL_OUTPUT_VERBOSE((20, mca_coll_han_component.han_output,
                         "In HAN seg_count %d count %d num_seg %d\n",
                         seg_count, count, num_segments));

    /* Map the world root onto its (low, up) sub-communicator ranks */
    int *vranks = han_module->cached_vranks;
    int low_rank = ompi_comm_rank(low_comm);
    int low_size = ompi_comm_size(low_comm);
    int root_low_rank, root_up_rank;
    mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank);
    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                         "[%d]: root_low_rank %d root_up_rank %d\n", w_rank, root_low_rank,
                         root_up_rank));

    /* Create t0 tasks for the first segment */
    mca_coll_task_t *t0 = OBJ_NEW(mca_coll_task_t);
    /* Setup up t0 task arguments; noop is true on ranks that are not the node
     * leader and therefore skip the inter-node step.
     * NOTE(review): malloc return is not checked — TODO confirm policy. */
    mca_coll_han_bcast_args_t *t = malloc(sizeof(mca_coll_han_bcast_args_t));
    mca_coll_han_set_bcast_args(t, t0, (char *) buff, seg_count, dtype,
                                root_up_rank, root_low_rank, up_comm, low_comm,
                                num_segments, 0, w_rank, count - (num_segments - 1) * seg_count,
                                low_rank != root_low_rank);
    /* Init the first task */
    init_task(t0, mca_coll_han_bcast_t0_task, (void *) t);
    issue_task(t0);

    /* Create t1 task */
    mca_coll_task_t *t1 = OBJ_NEW(mca_coll_task_t);
    /* Setup up t1 task arguments (t is reused and mutated across iterations) */
    t->cur_task = t1;
    /* Init the t1 task */
    init_task(t1, mca_coll_han_bcast_t1_task, (void *) t);
    issue_task(t1);

    /* One t1 stage per remaining segment, advancing the buffer each time */
    while (t->cur_seg <= t->num_segments - 2) {
        /* Create t1 task */
        t->cur_task = t1 = OBJ_NEW(mca_coll_task_t);
        t->buff = (char *) t->buff + extent * seg_count;
        t->cur_seg = t->cur_seg + 1;
        /* Init the t1 task */
        init_task(t1, mca_coll_han_bcast_t1_task, (void *) t);
        issue_task(t1);
    }

    free(t);
    return OMPI_SUCCESS;
}
/* t0 task: perform the upper-level (inter-node) bcast of segment 0.
 * Ranks flagged noop (non node-leaders) do nothing. */
int mca_coll_han_bcast_t0_task(void *task_args)
{
    mca_coll_han_bcast_args_t *t = (mca_coll_han_bcast_args_t *) task_args;
    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t0 %d\n", t->w_rank,
                         t->cur_seg));
    OBJ_RELEASE(t->cur_task);
    if (!t->noop) {
        t->up_comm->c_coll->coll_bcast((char *) t->buff, t->seg_count, t->dtype, t->root_up_rank,
                                       t->up_comm, t->up_comm->c_coll->coll_bcast_module);
    }
    return OMPI_SUCCESS;
}
/* t1 task:
 * 1. issue the upper level ibcast of segment cur_seg + 1
 * 2. issue the low level bcast of segment cur_seg
 * 3. wait for the completion of the ibcast
 * The inter-node ibcast of the NEXT segment thus overlaps with the intra-node
 * bcast of the CURRENT segment.
 */
int mca_coll_han_bcast_t1_task(void *task_args)
{
    mca_coll_han_bcast_args_t *t = (mca_coll_han_bcast_args_t *) task_args;
    ompi_request_t *ibcast_req = NULL;  /* stays NULL when no ibcast is posted */
    int tmp_count = t->seg_count;
    ptrdiff_t extent, lb;
    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t1 %d\n", t->w_rank,
                         t->cur_seg));
    OBJ_RELEASE(t->cur_task);
    ompi_datatype_get_extent(t->dtype, &lb, &extent);
    if (!t->noop) {
        /* Inter-node ibcast of the next segment; the final segment may be
         * shorter (last_seg_count) */
        if (t->cur_seg <= t->num_segments - 2 ) {
            if (t->cur_seg == t->num_segments - 2) {
                tmp_count = t->last_seg_count;
            }
            t->up_comm->c_coll->coll_ibcast((char *) t->buff + extent * t->seg_count,
                                            tmp_count, t->dtype, t->root_up_rank,
                                            t->up_comm, &ibcast_req,
                                            t->up_comm->c_coll->coll_ibcast_module);
        }
    }
    /* are we the last segment to be pushed downstream ? */
    tmp_count = (t->cur_seg == (t->num_segments - 1)) ? t->last_seg_count : t->seg_count;
    /* Intra-node bcast of the current segment, overlapped with the ibcast */
    t->low_comm->c_coll->coll_bcast((char *) t->buff,
                                    tmp_count, t->dtype, t->root_low_rank, t->low_comm,
                                    t->low_comm->c_coll->coll_bcast_module);
    /* Complete the inter-node ibcast before the next stage reads that segment */
    if (NULL != ibcast_req) {
        ompi_request_wait(&ibcast_req, MPI_STATUS_IGNORE);
    }
    return OMPI_SUCCESS;
}
/* Simple (non-segmented) hierarchical bcast:
 *   1) inter-node bcast among the node leaders,
 *   2) intra-node bcast within each node.
 * Falls back on the previously selected component when the communicator
 * cannot be split hierarchically or ranks-per-node is imbalanced. */
int
mca_coll_han_bcast_intra_simple(void *buff,
                                int count,
                                struct ompi_datatype_t *dtype,
                                int root,
                                struct ompi_communicator_t *comm,
                                mca_coll_base_module_t *module)
{
    mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;
    int err, w_rank = ompi_comm_rank(comm);

    /* Create the subcommunicators */
    err = mca_coll_han_comm_create_new(comm, han_module);
    if( OMPI_SUCCESS != err ) { /* Let's hope the error is consistently returned across the entire communicator */
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "han cannot handle bcast with this communicator. Fall back on another component\n"));
        /* Re-install the fallback collectives and call the fallback once; all
         * future calls are then redirected automatically. */
        HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
        return comm->c_coll->coll_bcast(buff, count, dtype, root,
                                        comm, comm->c_coll->coll_bcast_module);
    }

    /* The topology must be known to detect rank-distribution imbalance */
    mca_coll_han_topo_init(comm, han_module, 2);
    if (han_module->are_ppn_imbalanced) {
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "han cannot handle bcast with this communicator (imbalance). Fall back on another component\n"));
        /* Re-install the fallback bcast and call it once */
        HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, bcast);
        return comm->c_coll->coll_bcast(buff, count, dtype, root,
                                        comm, comm->c_coll->coll_bcast_module);
    }

    ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE];
    ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE];
    int low_rank = ompi_comm_rank(low_comm);
    int root_low_rank, root_up_rank;
    /* Map the world root onto its (low, up) sub-communicator ranks */
    mca_coll_han_get_ranks(han_module->cached_vranks, root, ompi_comm_size(low_comm),
                           &root_low_rank, &root_up_rank);
    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                         "[%d]: root_low_rank %d root_up_rank %d\n",
                         w_rank, root_low_rank, root_up_rank));

    if (low_rank == root_low_rank) {
        /* Node leaders first propagate the data across nodes.
         * To remove when han has better sub-module selection: switching this
         * call to coll_ibcast + ompi_request_wait enables runs with libnbc. */
        up_comm->c_coll->coll_bcast(buff, count, dtype, root_up_rank,
                                    up_comm, up_comm->c_coll->coll_bcast_module);
    }
    /* Then every node distributes the data internally */
    low_comm->c_coll->coll_bcast(buff, count, dtype, root_low_rank,
                                 low_comm, low_comm->c_coll->coll_bcast_module);
    return OMPI_SUCCESS;
}

392
ompi/mca/coll/han/coll_han_component.c Обычный файл
Просмотреть файл

@ -0,0 +1,392 @@
/*
* Copyright (c) 2018-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Most of the description of the data layout is in the
* coll_han_module.c file.
*/
#include "ompi_config.h"
#include "opal/util/show_help.h"
#include "ompi/constants.h"
#include "ompi/mca/coll/coll.h"
#include "coll_han.h"
#include "coll_han_dynamic.h"
#include "coll_han_dynamic_file.h"
#include "ompi/mca/coll/base/coll_base_util.h"
/*
* Public string showing the coll ompi_han component version number
*/
const char *mca_coll_han_component_version_string =
"Open MPI HAN collective MCA component version " OMPI_VERSION;
/* Registration names of the collective components HAN can delegate to.
 * NOTE(review): this table is indexed by the component identifier (e.g.
 * available_components[fallback] in the allreduce fallback decision), so the
 * entry order must stay in sync with the identifier values (SELF, BASIC, ...)
 * — confirm against the COMPONENT_T declaration. */
ompi_coll_han_components available_components[COMPONENTS_COUNT] = {
    { SELF, "self", NULL },
    { BASIC, "basic", NULL },
    { LIBNBC, "libnbc", NULL },
    { TUNED, "tuned", NULL },
    { SM, "sm", NULL },
    { SHARED, "shared", NULL },
    { ADAPT, "adapt", NULL },
    { HAN, "han", NULL }
};
/*
* Local functions
*/
static int han_open(void);
static int han_close(void);
static int han_register(void);
/*
* Instantiate the public struct with all of our public information
* and pointers to our public functions in it
*/
mca_coll_han_component_t mca_coll_han_component = {
    /* First, fill in the super */
    {
        /* First, the mca_component_t struct containing meta
           information about the component itself */
        .collm_version = {
            MCA_COLL_BASE_VERSION_2_0_0,

            /* Component name and version */
            .mca_component_name = "han",
            MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
                                  OMPI_RELEASE_VERSION),

            /* Component functions */
            .mca_open_component = han_open,
            .mca_close_component = han_close,
            .mca_register_component_params = han_register,
        },
        .collm_data = {
            /* The component is not checkpoint ready */
            MCA_BASE_METADATA_PARAM_NONE},

        /* Initialization / querying functions */
        .collm_init_query = mca_coll_han_init_query,
        .collm_comm_query = mca_coll_han_comm_query,
    },
    /* han-component specific information */
    /* (default) priority — NOTE(review): han_register() resets han_priority to
     * 0 before registering the "priority" MCA parameter, so this initializer
     * appears to be superseded; confirm intent. */
    20,
};
/*
* Init the component
*/
static int han_open(void)
{
/* Get the global coll verbosity: it will be ours */
mca_coll_han_component.han_output = ompi_coll_base_framework.framework_output;
return mca_coll_han_init_dynamic_rules();
}
/*
 * Shut down the component: release the memory owned by the dynamic
 * selection rules loaded in han_open().
 */
static int han_close(void)
{
    mca_coll_han_free_dynamic_rules();
    return OMPI_SUCCESS;
}
/* True when a "simple" (non-pipelined) HAN variant exists for this
 * collective; used to decide which use_simple_* MCA params to register. */
static bool is_simple_implemented(COLLTYPE_T coll)
{
    return (ALLGATHER == coll) || (ALLREDUCE == coll) || (BCAST == coll)
        || (GATHER == coll) || (REDUCE == coll);
}
/* Human-readable name of a topological level, used to build MCA parameter
 * names and log messages. */
const char* mca_coll_han_topo_lvl_to_str(TOPO_LVL_T topo_lvl)
{
    if (INTRA_NODE == topo_lvl) {
        return "intra_node";
    }
    if (INTER_NODE == topo_lvl) {
        return "inter_node";
    }
    if (GLOBAL_COMMUNICATOR == topo_lvl) {
        return "global_communicator";
    }
    /* NB_TOPO_LVL and any out-of-range value */
    return "invalid topologic level";
}
/*
* Register MCA params
*/
static int han_register(void)
{
mca_base_component_t *c = &mca_coll_han_component.super.collm_version;
mca_coll_han_component_t *cs = &mca_coll_han_component;
/* Generated parameters name and description */
char param_name[128], param_desc[256];
int param_desc_size;
COLLTYPE_T coll;
TOPO_LVL_T topo_lvl;
COMPONENT_T component;
cs->han_priority = 0;
(void) mca_base_component_var_register(c, "priority", "Priority of the HAN coll component",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &cs->han_priority);
cs->han_bcast_segsize = 65536;
(void) mca_base_component_var_register(c, "bcast_segsize",
"segment size for bcast",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &cs->han_bcast_segsize);
cs->han_bcast_up_module = 0;
(void) mca_base_component_var_register(c, "bcast_up_module",
"up level module for bcast, 0 libnbc, 1 adapt",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &cs->han_bcast_up_module);
cs->han_bcast_low_module = 0;
(void) mca_base_component_var_register(c, "bcast_low_module",
"low level module for bcast, 0 sm, 1 solo",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &cs->han_bcast_low_module);
cs->han_reduce_segsize = 524288;
(void) mca_base_component_var_register(c, "reduce_segsize",
"segment size for reduce",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &cs->han_reduce_segsize);
cs->han_reduce_up_module = 0;
(void) mca_base_component_var_register(c, "reduce_up_module",
"up level module for allreduce, 0 libnbc, 1 adapt",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &cs->han_reduce_up_module);
cs->han_reduce_low_module = 0;
(void) mca_base_component_var_register(c, "reduce_low_module",
"low level module for allreduce, 0 sm, 1 shared",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &cs->han_reduce_low_module);
cs->han_allreduce_segsize = 524288;
(void) mca_base_component_var_register(c, "allreduce_segsize",
"segment size for allreduce",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allreduce_segsize);
cs->han_allreduce_up_module = 0;
(void) mca_base_component_var_register(c, "allreduce_up_module",
"up level module for allreduce, 0 libnbc, 1 adapt",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allreduce_up_module);
cs->han_allreduce_low_module = 0;
(void) mca_base_component_var_register(c, "allreduce_low_module",
"low level module for allreduce, 0 sm, 1 shared",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allreduce_low_module);
cs->han_allgather_up_module = 0;
(void) mca_base_component_var_register(c, "allgather_up_module",
"up level module for allgather, 0 libnbc, 1 adapt",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allgather_up_module);
cs->han_allgather_low_module = 0;
(void) mca_base_component_var_register(c, "allgather_low_module",
"low level module for allgather, 0 sm, 1 shared",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allgather_low_module);
cs->han_gather_up_module = 0;
(void) mca_base_component_var_register(c, "gather_up_module",
"up level module for gather, 0 libnbc, 1 adapt",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &cs->han_gather_up_module);
cs->han_gather_low_module = 0;
(void) mca_base_component_var_register(c, "gather_low_module",
"low level module for gather, 0 sm, 1 shared",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &cs->han_gather_low_module);
cs->han_scatter_up_module = 0;
(void) mca_base_component_var_register(c, "scatter_up_module",
"up level module for scatter, 0 libnbc, 1 adapt",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &cs->han_scatter_up_module);
cs->han_scatter_low_module = 0;
(void) mca_base_component_var_register(c, "scatter_low_module",
"low level module for scatter, 0 sm, 1 shared",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &cs->han_scatter_low_module);
cs->han_reproducible = 0;
(void) mca_base_component_var_register(c, "reproducible",
"whether we need reproducible results "
"(enabling this disables optimisations using topology)"
"0 disable 1 enable, default 0",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_READONLY, &cs->han_reproducible);
/* Simple algorithms MCA parameters */
for(coll = 0 ; coll < COLLCOUNT ; coll++) {
cs->use_simple_algorithm[coll] = false;
if(is_simple_implemented(coll)) {
snprintf(param_name, sizeof(param_name), "use_simple_%s",
mca_coll_base_colltype_to_str(coll));
snprintf(param_desc, sizeof(param_desc), "whether to enable simple algo for %s",
mca_coll_base_colltype_to_str(coll));
mca_base_component_var_register(c, param_name,
param_desc,
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&(cs->use_simple_algorithm[coll]));
}
}
/* Dynamic rules MCA parameters */
memset(cs->mca_rules, 0,
COLLCOUNT * (GLOBAL_COMMUNICATOR+1) * sizeof(COMPONENT_T));
for(coll = 0; coll < COLLCOUNT; coll++) {
if(!mca_coll_han_is_coll_dynamic_implemented(coll)) {
continue;
}
/*
* Default values
*/
cs->mca_rules[coll][INTRA_NODE] = TUNED;
cs->mca_rules[coll][INTER_NODE] = BASIC;
cs->mca_rules[coll][GLOBAL_COMMUNICATOR] = HAN;
for(topo_lvl = 0; topo_lvl < NB_TOPO_LVL; topo_lvl++) {
snprintf(param_name, sizeof(param_name), "%s_dynamic_%s_module",
mca_coll_base_colltype_to_str(coll),
mca_coll_han_topo_lvl_to_str(topo_lvl));
param_desc_size = snprintf(param_desc, sizeof(param_desc),
"Collective module to use for %s on %s topological level: ",
mca_coll_base_colltype_to_str(coll),
mca_coll_han_topo_lvl_to_str(topo_lvl));
/*
* Exhaustive description:
* 0 = self; 1 = basic; 2 = libnbc; ...
* FIXME: Do not print component not providing this collective
*/
for(component = 0 ; component < COMPONENTS_COUNT ; component++) {
if(HAN == component && GLOBAL_COMMUNICATOR != topo_lvl) {
/* Han can only be used on the global communicator */
continue;
}
param_desc_size += snprintf(param_desc+param_desc_size, sizeof(param_desc) - param_desc_size,
"%d = %s; ",
component,
available_components[component].component_name);
}
mca_base_component_var_register(c, param_name, param_desc,
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&(cs->mca_rules[coll][topo_lvl]));
}
}
/* Dynamic rules */
cs->use_dynamic_file_rules = false;
(void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version,
"use_dynamic_file_rules",
"Enable the dynamic selection provided via the dynamic_rules_filename MCA",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_6,
MCA_BASE_VAR_SCOPE_READONLY,
&(cs->use_dynamic_file_rules));
cs->dynamic_rules_filename = NULL;
(void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version,
"dynamic_rules_filename",
"Configuration file containing the dynamic selection rules",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
OPAL_INFO_LVL_6,
MCA_BASE_VAR_SCOPE_READONLY,
&(cs->dynamic_rules_filename));
cs->dump_dynamic_rules = false;
(void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version,
"dump_dynamic_rules",
"Switch used to decide if we dump dynamic rules provided by configuration file",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_6,
MCA_BASE_VAR_SCOPE_READONLY,
&(cs->dump_dynamic_rules));
if((cs->dump_dynamic_rules || NULL != cs->dynamic_rules_filename)
&& !cs->use_dynamic_file_rules) {
opal_output_verbose(0, cs->han_output,
"HAN: dynamic rules for collectives are hot activated."
"Check coll_han_use_dynamic_file_rules MCA parameter");
}
cs->max_dynamic_errors = 10;
(void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version,
"max_dynamic_errors",
"Number of dynamic rules module/function "
"errors printed on rank 0 "
"with a 0 verbosity."
"Useless if coll_base_verbose is 30 or more.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_6,
MCA_BASE_VAR_SCOPE_READONLY,
&(cs->max_dynamic_errors));
return OMPI_SUCCESS;
}

1069
ompi/mca/coll/han/coll_han_dynamic.c Обычный файл

Разница между файлами не показана из-за своего большого размера Загрузить разницу

214
ompi/mca/coll/han/coll_han_dynamic.h Обычный файл
Просмотреть файл

@ -0,0 +1,214 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_COLL_HAN_DYNAMIC_H
#define MCA_COLL_HAN_DYNAMIC_H
#include "ompi_config.h"
#include "mpi.h"
#include "ompi/mca/mca.h"
#include "opal/util/output.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "ompi/mca/coll/han/coll_han.h"
/*
* #################################################
* # Dynamic rules global architecture description #
* #################################################
*
* Han dynamic rules allow the user to define the collective
* module to call depending on the topological configuration of the
* sub-communicators and the collective parameters. This mechanism
* can also be used to fallback to the main collective on another module.
* The interface is described in coll_han_dynamic_file.h.
*
* #############################
* # Collective module storage #
* #############################
* To be able to switch between multiple collective modules, han
* directly accesses the module on the communicator. This information is
* stored in the collective structure of the communicator during the collective
* module choice at the communicator initialization. When han needs this
* information for the first time, it identifies the modules by their name and
* stores them in its module structure.
* Then, the modules are identified by their identifier.
*
* #########################
* # Dynamic rules storage #
* #########################
* There are two types of dynamic rules:
* - MCA parameter defined rules
* - File defined rules
*
* MCA parameter defined rules are stored in mca_coll_han_component.mca_rules.
 * This is a double indexed table. The first index is the corresponding collective
* communication and the second index is the topological level aimed by the rule.
* These parameters define the collective component to use for a specific
* collective communication on a specific topologic level.
*
* File defined rules are stored in mca_coll_han_component.dynamic_rules.
 * These structures are defined below. The rule storage is directly deduced
* from the rule file format.
*
* File defined rules precede MCA parameter defined rules.
*
* #######################
* # Dynamic rules usage #
* #######################
* To choose which collective module to use on a specific configuration, han
* adds an indirection on the collective call: dynamic choice functions. These
* functions do not implement any collective. First, they try to find a dynamic
* rule from file for the given collective. If there is not any rule for the
* given configuration, MCA parameter defined rules are used. Once the module
* to use is found, the correct collective implementation is called.
*
* This indirection is also used on the global communicator. This allows han
* to provide a fallback mechanism considering the collective parameters.
*
* ##############################
* # Dynamic rules choice logic #
* ##############################
* Dynamic rules choice is made with a stack logic. Each new rule precedes
* already defined rules. MCA parameters rules are the stack base. When
* a rule is needed, rules are read as a stack and the first corresponding
* encountered is chosen.
*
* Consequences:
* - If a collective identifier appears multiple times, only the last
* will be considered
* - If a topological level appears multiple times for a collective,
* only the last will be considered
* - If configuration rules or message size rules are not stored
* by increasing value, some of them will not be considered
*/
/* Dynamic rules support */

/*
 * Identifiers of the collective components a dynamic rule can select.
 * These values are used as indices into available_components[], so the
 * order here must match the initialization of that table.
 */
typedef enum COMPONENTS {
    SELF = 0,
    BASIC,
    LIBNBC,
    TUNED,
    SM,
    SHARED,
    ADAPT,
    HAN,
    COMPONENTS_COUNT
} COMPONENT_T;

/* Descriptor binding a component identifier to its name and to the
 * corresponding coll base component (resolved by name — see the
 * "Collective module storage" section of the comment above). */
typedef struct {
    COMPONENT_T id;
    char* component_name;
    mca_coll_base_component_t* component;
} ompi_coll_han_components;

/* Table of all components han can select, indexed by COMPONENT_T */
extern ompi_coll_han_components available_components[COMPONENTS_COUNT];

/* Topologic levels */
typedef enum TOPO_LVL {
    INTRA_NODE = 0,
    INTER_NODE,
    /* Identifies the global communicator as a topologic level */
    GLOBAL_COMMUNICATOR,
    NB_TOPO_LVL
} TOPO_LVL_T;

/* Rule for a specific msg size
 * in a specific configuration
 * for a specific collective
 * in a specific topologic level */
typedef struct msg_size_rule_s {
    COLLTYPE_T collective_id;
    TOPO_LVL_T topologic_level;
    int configuration_size;

    /* Message size of the rule */
    size_t msg_size;

    /* Component to use on this specific configuration
     * and message size */
    COMPONENT_T component;
} msg_size_rule_t;

/* Rule for a specific configuration
 * considering a specific collective
 * in a specific topologic level */
typedef struct configuration_rule_s {
    COLLTYPE_T collective_id;
    TOPO_LVL_T topologic_level;

    /* Number of elements of the actual topologic level
     * per element of the upper topologic level */
    int configuration_size;

    /* Number of message size rules for this configuration */
    int nb_msg_size;

    /* Table of message size rules for this configuration */
    msg_size_rule_t *msg_size_rules;
} configuration_rule_t;

/* Set of dynamic rules for a specific collective
 * in a specific topologic level */
typedef struct topologic_rule_s {
    /* Collective identifier */
    COLLTYPE_T collective_id;

    /* Topologic level of the rule */
    TOPO_LVL_T topologic_level;

    /* Rule number */
    int nb_rules;

    /* Table of configuration rules
     * for this collective on this topologic level */
    configuration_rule_t *configuration_rules;
} topologic_rule_t;

/* Set of dynamic rules for a collective */
typedef struct collective_rule_s {
    COLLTYPE_T collective_id;

    /* Number of topologic level for this collective */
    int nb_topologic_levels;

    /* Table of topologic level rules
     * for this collective */
    topologic_rule_t *topologic_rules;
} collective_rule_t;

/* Global dynamic rules structure */
typedef struct mca_coll_han_dynamic_rule_s {
    int nb_collectives;
    collective_rule_t *collective_rules;
} mca_coll_han_dynamic_rules_t;

/* Module storage */
typedef struct collective_module_storage_s {
    /* Module */
    mca_coll_base_module_t *module_handler;
} collective_module_storage_t;

/* Table of module storage */
typedef struct mca_coll_han_collective_modules_storage_s {
    /* One slot per component identifier (indexed by COMPONENT_T) */
    collective_module_storage_t modules[COMPONENTS_COUNT];
} mca_coll_han_collective_modules_storage_t;

/* Tests if a dynamic collective is implemented */
bool mca_coll_han_is_coll_dynamic_implemented(COLLTYPE_T coll_id);

/* Maps a component name to its COMPONENT_T identifier.
 * NOTE(review): callers range-check the result against [SELF, COMPONENTS_COUNT),
 * so an unknown name presumably yields an out-of-range value — confirm in
 * coll_han_dynamic.c */
COMPONENT_T mca_coll_han_component_name_to_id(const char* name);
#endif

606
ompi/mca/coll/han/coll_han_dynamic_file.c Обычный файл
Просмотреть файл

@ -0,0 +1,606 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2018-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef HAVE_STDIO_H
#include <stdio.h>
#endif
#include "ompi_config.h"
#include "coll_han.h"
#include "coll_han_dynamic.h"
#include "coll_han_dynamic_file.h"
#include "ompi/mca/coll/base/coll_base_util.h"
#define getnext_long(fptr, pval) ompi_coll_base_file_getnext_long(fptr, &fileline, pval)
#define getnext_string(fptr, pval) ompi_coll_base_file_getnext_string(fptr, &fileline, pval)
#define getnext_size_t(fptr, pval) ompi_coll_base_file_getnext_size_t(fptr, &fileline, pval)
static void check_dynamic_rules(void);
/* Current file line for verbose message */
static int fileline = 1;
/*
 * Read the dynamic rule file named by the coll_han_dynamic_rules_filename
 * MCA parameter and populate mca_coll_han_component.dynamic_rules.
 *
 * The file format is documented in coll_han_dynamic_file.h.  Any parse
 * error invalidates the whole file: the partially built rule tables are
 * freed and han falls back on the MCA parameter defined rules, still
 * returning OMPI_SUCCESS.  Only an allocation failure returns OMPI_ERROR.
 *
 * Fixes vs. the original: the cannot_allocate exit path now closes the
 * rule file and releases the last collective name read (previously both
 * were leaked; only the file_reading_error path cleaned them up), and the
 * "filename is not set" diagnostic message is no longer garbled.
 */
int
mca_coll_han_init_dynamic_rules(void)
{
    /* File management */
    const char *fname;
    FILE *fptr = NULL;
    int nb_entries = 0, rc;

    /* Loop counters */
    int i, j, k, l;

    /* Collective informations */
    long nb_coll, coll_id;
    char * coll_name = NULL;
    collective_rule_t *coll_rules;

    /* Topo informations */
    long nb_topo, topo_lvl;
    topologic_rule_t *topo_rules;

    /* Configuration informations */
    long nb_rules, conf_size;
    configuration_rule_t *conf_rules;

    /* Message size informations */
    long nb_msg_size;
    size_t msg_size;
    msg_size_rule_t *msg_size_rules;

    /* Component informations */
    long component;

    /* If the dynamic rules are not used, do not even read the file */
    if(!mca_coll_han_component.use_dynamic_file_rules) {
        return OMPI_SUCCESS;
    }

    if( NULL == (fname = mca_coll_han_component.dynamic_rules_filename) ) {
        opal_output_verbose(5, mca_coll_han_component.han_output,
                            "coll:han:mca_coll_han_init_dynamic_rules coll_han_use_dynamic_file_rules is set but "
                            "coll_han_dynamic_rules_filename is not set. Rules from MCA parameters will be used instead\n");
        mca_coll_han_component.dynamic_rules.nb_collectives = 0;
        return OMPI_SUCCESS;
    }

    if( NULL == (fptr = fopen(fname, "r")) ) {
        opal_output_verbose(5, mca_coll_han_component.han_output,
                            "coll:han:mca_coll_han_init_dynamic_rules cannot open dynamic file provided by "
                            "coll_han_dynamic_rules_filename=%s. Make sure it provides the full path and "
                            "check file permissions. Rules from MCA parameters will be used instead\n",
                            fname);
        mca_coll_han_component.dynamic_rules.nb_collectives = 0;
        return OMPI_SUCCESS;
    }

    /* The first information of the file is the collective count */
    if( (getnext_long(fptr, &nb_coll) < 0) || (nb_coll <= 0) ) {
        opal_output_verbose(5, mca_coll_han_component.han_output,
                            "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s "
                            "at line %d: an invalid value %ld is given for collective count "
                            "or the reader encountered an unexpected EOF\n",
                            fname, fileline, nb_coll);
        mca_coll_han_component.dynamic_rules.nb_collectives = 0;
        goto file_reading_error;
    }
    mca_coll_han_component.dynamic_rules.nb_collectives = nb_coll;

    /* Allocate collective rules */
    coll_rules = malloc(nb_coll * sizeof(collective_rule_t));
    mca_coll_han_component.dynamic_rules.collective_rules = coll_rules;
    if(NULL == coll_rules) {
        mca_coll_han_component.dynamic_rules.nb_collectives = 0;
        goto cannot_allocate;
    }

    /* Iterates on collective rules */
    for( i = 0 ; i < nb_coll ; i++ ) {
        /* Keep the already-read counts consistent at every step so a
         * failure can free exactly what was allocated so far */
        coll_rules[i].nb_topologic_levels = 0;
        mca_coll_han_component.dynamic_rules.nb_collectives = i+1;

        /* Get the collective identifier */
        if( getnext_string(fptr, &coll_name) < 0 ) {
            opal_output_verbose(5, mca_coll_han_component.han_output,
                                "coll:han:mca_coll_han_init_dynamic_rules invalid collective at line %d."
                                "The rest of the input file will be ignored.\n",
                                fileline);
            goto file_reading_error;
        }
        coll_id = mca_coll_base_name_to_colltype(coll_name);
        if( (coll_id < ALLGATHER) || (coll_id >= COLLCOUNT)) {
            /* maybe the file was in the old format and we read the collective index instead of the name. */
            char* endp;
            coll_id = strtol(coll_name, &endp, 10);
            if( '\0' != *endp ) {  /* there is garbage in the input */
                opal_output_verbose(5, mca_coll_han_component.han_output,
                                    "coll:han:mca_coll_han_init_dynamic_rules invalid collective %s "
                                    "at line %d: the collective must be at least %d and less than %d. "
                                    "The rest of the input file will be ignored.\n",
                                    coll_name, fileline, ALLGATHER, COLLCOUNT);
                goto file_reading_error;
            }
            free(coll_name);
            coll_name = mca_coll_base_colltype_to_str(coll_id);
        }

        if(!mca_coll_han_is_coll_dynamic_implemented(coll_id)) {
            opal_output_verbose(5, mca_coll_han_component.han_output,
                                "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s "
                                "read collective id %ld at line %d but this collective is not implemented yet. "
                                "This is not an error but this set of rules will not be used\n",
                                fname, coll_id, fileline);
        }

        /*
         * The first information of a collective rule
         * is the number of topologic rules
         */
        if( (getnext_long(fptr, &nb_topo) < 0) || (nb_topo < 0) ) {
            opal_output_verbose(5, mca_coll_han_component.han_output,
                                "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s "
                                "at line %d: an invalid value %ld is given for topo level count "
                                "or the reader encountered an unexpected EOF\n",
                                fname, fileline, nb_topo);
            goto file_reading_error;
        }

        /* Store the collective rule informations */
        coll_rules[i].nb_topologic_levels = nb_topo;
        coll_rules[i].collective_id = (COLLTYPE_T)coll_id;

        if(0 == nb_topo) {
            opal_output_verbose(5, mca_coll_han_component.han_output,
                                "coll:han:mca_coll_han_init_dynamic_rules Warning on dynamic rules file %s "
                                "at line %d: an invalid value %ld is given for topo level count\n",
                                fname, fileline, nb_topo);
            continue;
        }

        /* Allocate topologic rules */
        topo_rules = malloc(nb_topo * sizeof(topologic_rule_t));
        coll_rules[i].topologic_rules = topo_rules;
        if(NULL == topo_rules) {
            coll_rules[i].nb_topologic_levels = 0;
            goto cannot_allocate;
        }

        /* Iterates on topologic rules */
        for( j = 0 ; j < nb_topo ; j++ ) {
            topo_rules[j].nb_rules = 0;
            coll_rules[i].nb_topologic_levels = j+1;

            /* Get the topologic level identifier */
            if( (getnext_long(fptr, &topo_lvl) < 0) || (topo_lvl < INTRA_NODE) || (topo_lvl >= NB_TOPO_LVL) ) {
                opal_output_verbose(5, mca_coll_han_component.han_output,
                                    "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s "
                                    "at line %d: an invalid topo level %ld is given or the reader encountered an unexpected EOF. "
                                    "Topologic level must be at least %d and less than %d\n",
                                    fname, fileline, topo_lvl, INTRA_NODE, NB_TOPO_LVL);
                goto file_reading_error;
            }

            /*
             * The first information of a topologic rule
             * is the number of configurations
             */
            nb_rules = -1;
            if( (getnext_long(fptr, &nb_rules) < 0) || (nb_rules < 0) ) {
                opal_output_verbose(5, mca_coll_han_component.han_output,
                                    "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s "
                                    "at line %d: an invalid value %ld is given for rules count "
                                    "or the reader encountered an unexpected EOF\n",
                                    fname, fileline, nb_rules);
                goto file_reading_error;
            }

            /* Store the topologic rule informations */
            topo_rules[j].collective_id = coll_id;
            topo_rules[j].topologic_level = (TOPO_LVL_T)topo_lvl;
            topo_rules[j].nb_rules = nb_rules;

            if(0 == nb_rules) {
                opal_output_verbose(5, mca_coll_han_component.han_output,
                                    "coll:han:mca_coll_han_init_dynamic_rules Warning on dynamic rules file %s "
                                    "at line %d: an invalid value %ld is given for configuration rules count\n",
                                    fname, fileline, nb_rules);
                continue;
            }

            /* Allocate configuration rules */
            conf_rules = malloc(nb_rules * sizeof(configuration_rule_t));
            topo_rules[j].configuration_rules = conf_rules;
            if(NULL == conf_rules) {
                topo_rules[j].nb_rules = 0;
                goto cannot_allocate;
            }

            /* Iterate on configuration rules */
            for( k = 0; k < nb_rules; k++ ) {
                conf_rules[k].nb_msg_size = 0;
                topo_rules[j].nb_rules = k+1;

                /* Get the configuration size; the first one of a topologic
                 * level must be 1 so that every communicator size matches
                 * at least one rule */
                if( (getnext_long(fptr, &conf_size) < 0) || (conf_size < 1) || (0 == k && conf_size > 1) ) {
                    opal_output_verbose(5, mca_coll_han_component.han_output,
                                        "coll:han:mca_coll_han_init_dynamic_rules invalid configuration size %ld at line %d "
                                        "or the reader encountered an unexpected EOF the configuration size must be at least %d "
                                        "and the first configuration size of a topologic level must be %d\n",
                                        conf_size, fileline, 1, 1);
                    goto file_reading_error;
                }

                /*
                 * The first information of a configuration rule
                 * is the number of message size rules
                 */
                if( (getnext_long(fptr, &nb_msg_size) < 0) || (nb_msg_size < 0) ) {
                    opal_output_verbose(5, mca_coll_han_component.han_output,
                                        "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s "
                                        "at line %d: an invalid value %ld is given for message size rules count "
                                        "or the reader encountered an unexpected EOF\n",
                                        fname, fileline, nb_msg_size);
                    goto file_reading_error;
                }

                /* Store configuration rule information */
                conf_rules[k].collective_id = coll_id;
                conf_rules[k].topologic_level = topo_lvl;
                conf_rules[k].configuration_size = conf_size;
                conf_rules[k].nb_msg_size = nb_msg_size;

                if(0 == nb_msg_size) {
                    opal_output_verbose(5, mca_coll_han_component.han_output,
                                        "coll:han:mca_coll_han_init_dynamic_rules Warning on dynamic rules file %s "
                                        "at line %d: an invalid value %ld is given for message size rules count\n",
                                        fname, fileline, nb_msg_size);
                    continue;
                }

                /* Allocate message size rules */
                msg_size_rules = malloc(nb_msg_size * sizeof(msg_size_rule_t));
                conf_rules[k].msg_size_rules = msg_size_rules;
                if(NULL == msg_size_rules) {
                    conf_rules[k].nb_msg_size = 0;
                    goto cannot_allocate;
                }

                /* Iterate on message size rules */
                for( l = 0; l < nb_msg_size; l++ ) {
                    char* target_comp_name = NULL;
                    conf_rules[k].nb_msg_size = l+1;

                    /* Get the message size; the first one of a
                     * configuration must be 0 */
                    rc = getnext_size_t(fptr, &msg_size);
                    if( (rc < 0) ||
                        (0 == l && msg_size > 1)) {
                        opal_output_verbose(5, mca_coll_han_component.han_output,
                                            "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s "
                                            "at line %d: an invalid value %" PRIsize_t " is given for message size "
                                            "or the reader encountered an unexpected EOF. "
                                            "The first message size rule of a configuration must be 0\n",
                                            fname, fileline, msg_size);
                        goto file_reading_error;
                    }

                    /* Get the component identifier for this message size rule */
                    if( getnext_string(fptr, &target_comp_name) < 0 ) {
                        opal_output_verbose(5, mca_coll_han_component.han_output,
                                            "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s "
                                            "at line %d: cannot read the name of a collective component\n",
                                            fname, fileline);
                        goto file_reading_error;
                    }
                    component = mca_coll_han_component_name_to_id(target_comp_name);
                    if( (component < SELF) || (component >= COMPONENTS_COUNT) ) {
                        opal_output_verbose(5, mca_coll_han_component.han_output,
                                            "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s "
                                            "at line %d: an invalid collective component name %s was given or the "
                                            "reader encountered an unexpected EOF. Collective component id must be at "
                                            "least %d and less than %d\n",
                                            fname, fileline, target_comp_name, SELF, COMPONENTS_COUNT);
                        free(target_comp_name);
                        goto file_reading_error;
                    }

                    /* Store message size rule information */
                    msg_size_rules[l].collective_id = coll_id;
                    msg_size_rules[l].topologic_level = topo_lvl;
                    msg_size_rules[l].configuration_size = conf_size;
                    msg_size_rules[l].msg_size = msg_size;
                    msg_size_rules[l].component = (COMPONENT_T)component;
                    nb_entries++;

                    /* do we have the optional segment length */
                    if( 1 == ompi_coll_base_file_peek_next_char_is(fptr, &fileline, '[') ) {
                        opal_output_verbose(5, mca_coll_han_component.han_output,
                                            "coll:han:mca_coll_han_init_dynamic_rules found optional pipelining segment lengths\n");
                        long seglength;
                        if( 0 != topo_lvl ) {
                            opal_output_verbose(5, mca_coll_han_component.han_output,
                                                "coll:han:mca_coll_han_init_dynamic_rules "
                                                "file %s line %d found segment lengths for topological collective at level != 0 "
                                                "for collective %s component %s. These values will be ignored.\n",
                                                fname, fileline, coll_name, target_comp_name);
                        }
                        /* Consume the bracketed list; the values themselves
                         * are read and discarded here */
                        while( 0 == ompi_coll_base_file_peek_next_char_is(fptr, &fileline, ']') ) {
                            if( getnext_long(fptr, &seglength) ) {
                                opal_output_verbose(5, mca_coll_han_component.han_output,
                                                    "coll:han:mca_coll_han_init_dynamic_rules "
                                                    "file %s line %d found end of file while reading the optional list "
                                                    "of segment lengths for collective %s component %s\n",
                                                    fname, fileline, coll_name, target_comp_name);
                                free(target_comp_name);
                                goto file_reading_error;
                            }
                        }
                    }
                    free(target_comp_name);
                }
            }
        }
        if( NULL != coll_name ) {
            free(coll_name);
            coll_name = NULL;
        }
    }

    if( getnext_long(fptr, &nb_coll) > 0 ) {
        opal_output_verbose(5, mca_coll_han_component.han_output,
                            "coll:han:mca_coll_han_init_dynamic_rules. Warning on file %s at line %d: "
                            "rule reading is over but reader does not seem to have reached the end of the file\n",
                            fname, fileline);
    }

    opal_output_verbose(5, mca_coll_han_component.han_output,
                        "coll:han:mca_coll_han_init_dynamic_rules read %d rules from %s\n",
                        nb_entries, fname);

    if(mca_coll_han_component.dump_dynamic_rules) {
        mca_coll_han_dump_dynamic_rules();
    }

    fclose(fptr);
    check_dynamic_rules();
    return OMPI_SUCCESS;

cannot_allocate:
    /* The dynamic rules allocation failed
     * Free the already allocated rules and return a failure
     */
    opal_output_verbose(0, mca_coll_han_component.han_output,
                        "coll:han:mca_coll_han_init_dynamic_rules "
                        "cannot allocate dynamic rules\n");
    /* Release the resources the reading loop may still hold, exactly as
     * the file_reading_error path does */
    if( NULL != coll_name ) {
        free(coll_name);
    }
    if(fptr) {
        fclose (fptr);
    }
    /* Do not check free_dynamic_rules
     * because we are returning OMPI_ERROR anyway */
    mca_coll_han_free_dynamic_rules();
    return OMPI_ERROR;

file_reading_error:
    if( NULL != coll_name ) {
        free(coll_name);
    }
    opal_output_verbose(0, mca_coll_han_component.han_output,
                        "coll:han:mca_coll_han_init_dynamic_rules "
                        "could not fully read dynamic rules file. "
                        "Will use mca parameters defined rules. "
                        "To see error detail, please set "
                        "collective verbosity level over 5\n");
    if(fptr) {
        fclose (fptr);
    }
    mca_coll_han_free_dynamic_rules();
    return OMPI_SUCCESS;
}
/*
 * Release every rule table built by mca_coll_han_init_dynamic_rules and
 * reset the stored collective count so the tables cannot be walked again.
 * The nb_* counters kept in each level tell exactly how many entries were
 * successfully allocated, so a partially built hierarchy (after an
 * allocation failure) is freed correctly as well.
 */
void
mca_coll_han_free_dynamic_rules(void)
{
    const int coll_count = mca_coll_han_component.dynamic_rules.nb_collectives;
    collective_rule_t *colls = mca_coll_han_component.dynamic_rules.collective_rules;

    for (int c = 0; c < coll_count; c++) {
        const int topo_count = colls[c].nb_topologic_levels;
        topologic_rule_t *topos = colls[c].topologic_rules;

        for (int t = 0; t < topo_count; t++) {
            const int conf_count = topos[t].nb_rules;
            configuration_rule_t *confs = topos[t].configuration_rules;

            /* Innermost tables first: message size rules per configuration */
            for (int r = 0; r < conf_count; r++) {
                if (confs[r].nb_msg_size > 0) {
                    free(confs[r].msg_size_rules);
                }
            }
            if (conf_count > 0) {
                free(confs);
            }
        }
        if (topo_count > 0) {
            free(topos);
        }
    }
    if (coll_count > 0) {
        free(colls);
    }

    /* Mark the rule set as empty */
    mca_coll_han_component.dynamic_rules.nb_collectives = 0;
}
/*
* Try to find any logical issue in dynamic rules
*/
static void check_dynamic_rules(void)
{
    const int coll_count = mca_coll_han_component.dynamic_rules.nb_collectives;
    collective_rule_t *colls = mca_coll_han_component.dynamic_rules.collective_rules;

    /* Walk the whole rule hierarchy and emit a verbose warning for each
     * logical inconsistency; nothing is modified or rejected here */
    for (int c = 0; c < coll_count; c++) {
        COLLTYPE_T coll_id = colls[c].collective_id;
        topologic_rule_t *topos = colls[c].topologic_rules;

        for (int t = 0; t < colls[c].nb_topologic_levels; t++) {
            TOPO_LVL_T topo_lvl = topos[t].topologic_level;
            configuration_rule_t *confs = topos[t].configuration_rules;

            for (int r = 0; r < topos[t].nb_rules; r++) {
                int conf_size = confs[r].configuration_size;
                msg_size_rule_t *sizes = confs[r].msg_size_rules;

                /* Configuration sizes act as lower bounds: out-of-order
                 * entries shadow the ones that follow them */
                if (r >= 1 && confs[r-1].configuration_size > conf_size) {
                    opal_output_verbose(5, mca_coll_han_component.han_output,
                                        "coll:han:check_dynamic_rules HAN found an issue on dynamic rules "
                                        "for collective %d on topological level %d: "
                                        "configuration sizes %d and %d are not sorted by increasing value\n",
                                        coll_id, topo_lvl, confs[r-1].configuration_size, conf_size);
                }

                for (int s = 0; s < confs[r].nb_msg_size; s++) {
                    size_t msg_size = sizes[s].msg_size;
                    COMPONENT_T component = sizes[s].component;

                    /* Same ordering requirement for message sizes */
                    if (s >= 1 && sizes[s-1].msg_size > msg_size) {
                        opal_output_verbose(5, mca_coll_han_component.han_output,
                                            "coll:han:check_dynamic_rules HAN found an issue on dynamic rules "
                                            "for collective %d on topological level %d with configuration size %d: "
                                            "message sizes %" PRIsize_t " and %" PRIsize_t " are "
                                            "not sorted by increasing value\n",
                                            coll_id, topo_lvl, conf_size, sizes[s-1].msg_size, msg_size);
                    }

                    /* han itself can only be selected on the global communicator */
                    if ((HAN == component) && (GLOBAL_COMMUNICATOR != topo_lvl)) {
                        opal_output_verbose(5, mca_coll_han_component.han_output,
                                            "coll:han:check_dynamic_rules HAN found an issue on dynamic rules "
                                            "for collective %d on topological level %d with configuration size %d "
                                            "for message size %" PRIsize_t ": han collective component %d "
                                            "can only be activated for topology level %d\n",
                                            coll_id, topo_lvl, conf_size, msg_size, HAN, GLOBAL_COMMUNICATOR);
                    }
                }
            }
        }
    }
}
void mca_coll_han_dump_dynamic_rules(void)
{
int nb_entries = 0;
/* Collective informations */
int nb_coll;
COLLTYPE_T coll_id;
collective_rule_t *coll_rules;
/* Topo informations */
int nb_topo;
TOPO_LVL_T topo_lvl;
topologic_rule_t *topo_rules;
/* Configuration informations */
int nb_rules, conf_size;
configuration_rule_t *conf_rules;
/* Message size informations */
int nb_msg_size, msg_size;
msg_size_rule_t *msg_size_rules;
/* Component informations */
COMPONENT_T component;
nb_coll = mca_coll_han_component.dynamic_rules.nb_collectives;
coll_rules = mca_coll_han_component.dynamic_rules.collective_rules;
for(int i = 0; i < nb_coll; i++ ) {
coll_id = coll_rules[i].collective_id;
nb_topo = coll_rules[i].nb_topologic_levels;
topo_rules = coll_rules[i].topologic_rules;
for(int j = 0; j < nb_topo; j++ ) {
topo_lvl = topo_rules[j].topologic_level;
nb_rules = topo_rules[j].nb_rules;
conf_rules = topo_rules[j].configuration_rules;
for(int k = 0; k < nb_rules; k++ ) {
conf_size = conf_rules[k].configuration_size;
nb_msg_size = conf_rules[k].nb_msg_size;
msg_size_rules = conf_rules[k].msg_size_rules;
for(int l = 0; l < nb_msg_size; l++ ) {
msg_size = msg_size_rules[l].msg_size;
component = msg_size_rules[l].component;
opal_output(mca_coll_han_component.han_output,
"coll:han:dump_dynamic_rules %d collective %d (%s) "
"topology level %d (%s) configuration size %d "
"mesage size %d -> collective component %d (%s)\n",
nb_entries, coll_id, mca_coll_base_colltype_to_str(coll_id),
topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), conf_size,
msg_size, component, available_components[component].component_name);
nb_entries++;
}
}
}
}
}

110
ompi/mca/coll/han/coll_han_dynamic_file.h Обычный файл
Просмотреть файл

@ -0,0 +1,110 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_COLL_HAN_DYNAMIC_FILE_H
#define MCA_COLL_HAN_DYNAMIC_FILE_H
#include "ompi_config.h"
#include "mpi.h"
#include "ompi/mca/mca.h"
#include "opal/util/output.h"
/*
* ############################
* # MCA parameters interface #
* ############################
* An MCA parameter defined rule allows the user to choose which collective
* module will be used for a specific collective communication on a specific
* topological level. The standard name for these parameters is:
* [collective]_dynamic_[topologic_level]_module
*
* #######################
* # Dynamic file format #
* #######################
 * File defined rules precede MCA parameter defined rules.
* To activate file reader, the MCA parameter use_dynamic_file_rules must
* be set to true. The path to the dynamic file is given by the MCA
* parameter dynamic_rules_filename. If there is any issue reading the file,
* the file is considered as invalid and only MCA parameter defined rules are
* used. If a potential logical issue is identified in the file, a
* warning is printed but the file is not considered as invalid.
*
* The file is built recursively.
* A set of rules of a type is built as follows:
* Number of rules of the set
* Rule1
* Rule2
* ...
*
* A rule of the level i is built as follows (excluding message size rule):
* Rule property
* Set of rules of level i+1
*
* A message size rule is built as follows:
* Message_size Component
*
* Rule properties are (by increasing level):
* - Collective identifier:
* Defined in ompi/mca/coll/base/coll_base_functions.h.
* - Topologic level:
* Defined in coll_han_dynamic.h. It defines the communicator
* topology level. This is GLOBAL_COMMUNICATOR for the user
* communicator and the corresponding level for sub-communicators
* created by han.
* - Configuration size:
* The configuration size is the number of elements in a topology level.
* For example, if topology levels are intra-node and inter-node, it can
* be the number of MPI ranks per node or the number of nodes in the global
* communicator. For the GLOBAL_COMMUNICATOR topologic level,
* the configuration size is the communicator size.
* - Message_size Component:
* This is the message size, in bytes, of the message. Component is
* the component identifier to use for this collective on this
* communicator with this message size. Components identifier are
* defined in coll_han_dynamic.h
*
* Here is an example of a dynamic rules file:
* 2 # Collective count
* 7 # Collective identifier 1 (defined in ompi/mca/coll/base/coll_base_functions.h)
* 2 # Topologic level count
* 0 # Topologic level identifier 1
* 1 # Configuration count
* 1 # Configuration size 1
* 2 # Message size rules count
* 0 3 # Message size 1 and component identifier
* 128 1 # Message size 2 and component identifier
* 1 # Topologic level identifier 2
* 1 # Configuration count
* 1 # Configuration size 1
* 1 # Message size rules count
* 0 1 # Message size 1 and component identifier
* 3 # Collective identifier 2
* # Set of topological rules
*
 * Note that configuration size and message size rules define minimal
 * values, and each new rule takes precedence over every previous rule.
 * This property implies that these types of rules must be sorted by
 * increasing value. If they are not, some rules won't be used.
*
* The counts define a stack. If the count is set to x, the reader will
* attempt to read x rules of the corresponding type. If a set of rules
* has an invalid count, this is an error and it might not be detected by
* the reader.
*/
int mca_coll_han_init_dynamic_rules(void);
void mca_coll_han_free_dynamic_rules(void);
void mca_coll_han_dump_dynamic_rules(void);
#endif

511
ompi/mca/coll/han/coll_han_gather.c Обычный файл
Просмотреть файл

@ -0,0 +1,511 @@
/*
* Copyright (c) 2018-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "coll_han.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "coll_han_trigger.h"
static int mca_coll_han_gather_lg_task(void *task_args);
static int mca_coll_han_gather_ug_task(void *task_args);
/* only work with regular situation (each node has equal number of processes) */
/*
 * Pack the parameters of a hierarchical gather into a single
 * mca_coll_han_gather_args_t so they can be handed as one opaque pointer
 * to the task functions (mca_coll_han_gather_lg_task /
 * mca_coll_han_gather_ug_task).
 *
 * Pure field-by-field copy: no validation, no allocation, no
 * communication.  sbuf_inter_free is presumably an intermediate buffer the
 * task machinery frees later (may be NULL) — confirm against the task
 * implementations.  Ownership of req stays with the caller.
 */
static inline void
mca_coll_han_set_gather_args(mca_coll_han_gather_args_t * args,
                             mca_coll_task_t * cur_task,
                             void *sbuf,
                             void *sbuf_inter_free,
                             int scount,
                             struct ompi_datatype_t *sdtype,
                             void *rbuf,
                             int rcount,
                             struct ompi_datatype_t *rdtype,
                             int root,
                             int root_up_rank,
                             int root_low_rank,
                             struct ompi_communicator_t *up_comm,
                             struct ompi_communicator_t *low_comm,
                             int w_rank, bool noop, bool is_mapbycore, ompi_request_t * req)
{
    args->cur_task = cur_task;            /* task descriptor driving this gather */
    args->sbuf = sbuf;
    args->sbuf_inter_free = sbuf_inter_free;
    args->scount = scount;
    args->sdtype = sdtype;
    args->rbuf = rbuf;
    args->rcount = rcount;
    args->rdtype = rdtype;
    args->root = root;                    /* root rank in the global communicator */
    args->root_up_rank = root_up_rank;    /* root's rank in the inter-node communicator */
    args->root_low_rank = root_low_rank;  /* root's rank in the intra-node communicator */
    args->up_comm = up_comm;
    args->low_comm = low_comm;
    args->w_rank = w_rank;                /* caller's rank in the global communicator */
    args->noop = noop;
    args->is_mapbycore = is_mapbycore;
    args->req = req;
}
/*
 * Hierarchical (HAN) MPI_Gather:
 *   1. lg task: low-level (intra-node) gather onto each node leader,
 *   2. ug task: upper-level (inter-node) gather from the leaders to the root,
 * chained through a temporary request this function waits on.  When ranks
 * are not mapped by core, the root gathers into a scratch buffer and
 * reorders the blocks into rbuf at the end using the cached topology.
 * Falls back on the previously selected component if the sub-communicators
 * cannot be built or the processes-per-node distribution is imbalanced.
 */
int
mca_coll_han_gather_intra(const void *sbuf, int scount,
                          struct ompi_datatype_t *sdtype,
                          void *rbuf, int rcount,
                          struct ompi_datatype_t *rdtype,
                          int root,
                          struct ompi_communicator_t *comm,
                          mca_coll_base_module_t * module)
{
    mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module;
    int w_rank, w_size;               /* information about the global communicator */
    int root_low_rank, root_up_rank;  /* root ranks for both sub-communicators */
    char *reorder_buf = NULL, *reorder_rbuf = NULL;
    int i, err, *vranks, low_rank, low_size, *topo;
    ompi_request_t *temp_request = NULL;

    /* Create the subcommunicators */
    err = mca_coll_han_comm_create(comm, han_module);
    if( OMPI_SUCCESS != err ) {  /* Let's hope the error is consistently returned across the entire communicator */
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "han cannot handle gather with this communicator. Fall back on another component\n"));
        /* HAN cannot work with this communicator so fallback on all collectives */
        HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
        return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf,
                                         rcount, rdtype, root,
                                         comm, comm->c_coll->coll_gather_module);
    }

    /* Topo must be initialized to know rank distribution which then is used to
     * determine if han can be used */
    topo = mca_coll_han_topo_init(comm, han_module, 2);
    if (han_module->are_ppn_imbalanced) {
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "han cannot handle gather with this communicator (imbalance). Fall back on another component\n"));
        /* Put back the fallback collective support and call it once. All
         * future calls will then be automatically redirected.
         */
        HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, gather);
        return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf,
                                         rcount, rdtype, root,
                                         comm, comm->c_coll->coll_gather_module);
    }

    w_rank = ompi_comm_rank(comm);
    w_size = ompi_comm_size(comm);

    /* Set up the request the two chained tasks complete; freed by
     * han_request_free through ompi_request_wait below */
    temp_request = OBJ_NEW(ompi_request_t);
    temp_request->req_state = OMPI_REQUEST_ACTIVE;
    temp_request->req_type = OMPI_REQUEST_COLL;
    temp_request->req_free = han_request_free;
    temp_request->req_status = (ompi_status_public_t){0};
    temp_request->req_complete = REQUEST_PENDING;

    /* Retrieve the cached hierarchical sub-communicators */
    ompi_communicator_t *low_comm =
        han_module->cached_low_comms[mca_coll_han_component.han_gather_low_module];
    ompi_communicator_t *up_comm =
        han_module->cached_up_comms[mca_coll_han_component.han_gather_up_module];

    /* Get the 'virtual ranks' mapping corresponding to the communicators */
    vranks = han_module->cached_vranks;
    /* information about sub-communicators */
    low_rank = ompi_comm_rank(low_comm);
    low_size = ompi_comm_size(low_comm);
    /* Get root ranks for low and up comms */
    mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank);

    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                         "[%d]: Han Gather root %d root_low_rank %d root_up_rank %d\n",
                         w_rank, root, root_low_rank, root_up_rank));

    /* Allocate reorder buffers */
    if (w_rank == root) {
        /* if the processes are mapped-by core, no need to reorder:
         * distribution of ranks on core first and node next,
         * in a increasing order for both patterns */
        if (han_module->is_mapbycore) {
            OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                                 "[%d]: Han Gather is_bycore: ", w_rank));
            reorder_rbuf = (char *)rbuf;
        } else {
            /* Need a buffer to store the unordered result; freed after the
             * reordering copy at the end of this function */
            ptrdiff_t rsize, rgap;
            rsize = opal_datatype_span(&rdtype->super,
                                       (int64_t)rcount * w_size,
                                       &rgap);
            reorder_buf = (char *)malloc(rsize);
            /* rgap is the size of unused space at the start of the datatype */
            reorder_rbuf = reorder_buf - rgap;

            if (MPI_IN_PLACE == sbuf) {
                ptrdiff_t rextent;
                ompi_datatype_type_extent(rdtype, &rextent);
                ptrdiff_t block_size = rextent * (ptrdiff_t)rcount;
                ptrdiff_t shift = block_size * w_rank;
                /* With MPI_IN_PLACE the root's own contribution already sits
                 * in rbuf at its world-rank position; seed it into the
                 * reorder buffer, which the lg task reads as the root's send
                 * data.  Bug fix: destination/source were swapped, reading
                 * the freshly malloc'ed (uninitialized) reorder buffer into
                 * rbuf (ompi_datatype_copy_content_same_ddt takes the
                 * destination before the source). */
                ompi_datatype_copy_content_same_ddt(rdtype,
                                                    (ptrdiff_t)rcount,
                                                    reorder_rbuf + shift,
                                                    (char *)rbuf + shift);
            }
        }
    }

    /* Create lg (low-level gather) task */
    mca_coll_task_t *lg = OBJ_NEW(mca_coll_task_t);
    /* Setup lg task arguments; the lg task chains the ug task itself */
    mca_coll_han_gather_args_t *lg_args = malloc(sizeof(mca_coll_han_gather_args_t));
    mca_coll_han_set_gather_args(lg_args, lg, (char *) sbuf, NULL, scount, sdtype, reorder_rbuf,
                                 rcount, rdtype, root, root_up_rank, root_low_rank, up_comm,
                                 low_comm, w_rank, low_rank != root_low_rank, han_module->is_mapbycore, temp_request);
    /* Init and issue the lg task */
    init_task(lg, mca_coll_han_gather_lg_task, (void *) (lg_args));
    issue_task(lg);

    ompi_request_wait(&temp_request, MPI_STATUS_IGNORE);

    /* Suppose, the expected message is 0 1 2 3 4 5 6 7 but the processes are
     * mapped on 2 nodes, for example |0 2 4 6| |1 3 5 7|. The messages from
     * low gather will be 0 2 4 6 and 1 3 5 7.
     * So the upper gather result is 0 2 4 6 1 3 5 7 which must be reordered.
     * The 3rd element (4) must be recopied at the 4th place. In general, the
     * i-th element must be recopied at the place given by the i-th entry of the
     * topology, which is topo[i*topolevel +1]
     */
    /* reorder rbuf based on rank */
    if (w_rank == root && !han_module->is_mapbycore) {
        ptrdiff_t rextent;
        ompi_datatype_type_extent(rdtype, &rextent);
        for (i=0; i<w_size; i++) {
            OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                                 "[%d]: Han Gather copy from %d to %d\n",
                                 w_rank,
                                 i * 2 + 1,
                                 topo[i * 2 + 1]));
            ptrdiff_t block_size = rextent * (ptrdiff_t)rcount;
            ptrdiff_t src_shift = block_size * i;
            ptrdiff_t dest_shift = block_size * (ptrdiff_t)topo[i * 2 + 1];
            /* Bug fix: destination/source were swapped here too, copying
             * rbuf back into the scratch buffer instead of producing the
             * ordered result in rbuf (consistent with
             * ompi_coll_han_reorder_gather below). */
            ompi_datatype_copy_content_same_ddt(rdtype,
                                                (ptrdiff_t)rcount,
                                                (char *)rbuf + dest_shift,
                                                reorder_rbuf + src_shift);
        }
        free(reorder_buf);
    }

    return OMPI_SUCCESS;
}
/* lg task: perform the low-level (intra-node) gather and, when it ends,
 * chain the upper-level (inter-node) gather task.
 *
 * Node leaders (noop == false) allocate an intermediate buffer sized for
 * low_size blocks, gather their node's contributions into it, then reuse it
 * as the send buffer of the upper-level gather.  Non-leader ranks only
 * participate in the low-level gather. */
int mca_coll_han_gather_lg_task(void *task_args)
{
    mca_coll_han_gather_args_t *t = (mca_coll_han_gather_args_t *) task_args;
    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] Han Gather: lg\n",
                         t->w_rank));

    /* The root uses its receive signature, everybody else the send one */
    ompi_datatype_t *dtype;
    size_t count;
    if (t->w_rank == t->root) {
        dtype = t->rdtype;
        count = t->rcount;
    } else {
        dtype = t->sdtype;
        count = t->scount;
    }

    /* If the process is one of the node leader */
    char *tmp_buf = NULL;
    char *tmp_rbuf = NULL;
    if (!t->noop) {
        /* if the process is one of the node leader, allocate the intermediary
         * buffer to gather on the low sub communicator */
        int low_size = ompi_comm_size(t->low_comm);
        int low_rank = ompi_comm_rank(t->low_comm);
        ptrdiff_t rsize, rgap = 0;
        rsize = opal_datatype_span(&dtype->super,
                                   count * low_size,
                                   &rgap);
        tmp_buf = (char *) malloc(rsize);  /* freed by the ug task via sbuf_inter_free */
        tmp_rbuf = tmp_buf - rgap;         /* rgap: unused space at the start of the datatype */
        if (t->w_rank == t->root) {
            if (MPI_IN_PLACE == t->sbuf) {
                /* MPI_IN_PLACE: the root's own block lives in rbuf at its
                 * world-rank position; copy it to its low-rank slot of the
                 * intermediate buffer before the low-level gather */
                ptrdiff_t rextent;
                ompi_datatype_type_extent(dtype, &rextent);
                ptrdiff_t block_size = rextent * (ptrdiff_t)count;
                ptrdiff_t src_shift = block_size * t->w_rank;
                ptrdiff_t dest_shift = block_size * low_rank;
                ompi_datatype_copy_content_same_ddt(dtype,
                                                    (ptrdiff_t)count,
                                                    tmp_rbuf + dest_shift,
                                                    (char *)t->rbuf + src_shift);
            }
        }
    }

    /* Low level (usually intra-node or shared memory) node gather */
    t->low_comm->c_coll->coll_gather((char *)t->sbuf,
                                     count,
                                     dtype,
                                     tmp_rbuf,
                                     count,
                                     dtype,
                                     t->root_low_rank,
                                     t->low_comm,
                                     t->low_comm->c_coll->coll_gather_module);

    /* Prepare up comm gather: the gathered node data becomes the send buffer */
    t->sbuf = tmp_rbuf;
    t->sbuf_inter_free = tmp_buf;

    /* Reuse the current task object for the ug (upper level gather) task */
    mca_coll_task_t *ug = t->cur_task;
    /* Init and issue the ug task */
    init_task(ug, mca_coll_han_gather_ug_task, (void *) t);
    issue_task(ug);

    return OMPI_SUCCESS;
}
/* ug task: upper level (inter-node) gather between node leaders.
 * Non-leaders (noop) only complete the request.  Leaders each contribute
 * low_size blocks toward the root's leader, then release the intermediate
 * buffer allocated by the lg task and complete the request the caller
 * (mca_coll_han_gather_intra) is waiting on. */
int mca_coll_han_gather_ug_task(void *task_args)
{
    mca_coll_han_gather_args_t *t = (mca_coll_han_gather_args_t *) task_args;
    OBJ_RELEASE(t->cur_task);  /* task object was reused by lg; done with it now */
    if (t->noop) {
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "[%d] Han Gather: ug noop\n", t->w_rank));
    } else {
        /* The root uses its receive signature, the other leaders the send one */
        ompi_datatype_t *dtype;
        size_t count;
        if (t->w_rank == t->root) {
            dtype = t->rdtype;
            count = t->rcount;
        } else {
            dtype = t->sdtype;
            count = t->scount;
        }

        int low_size = ompi_comm_size(t->low_comm);
        /* inter node gather: one block per local rank from each leader */
        t->up_comm->c_coll->coll_gather((char *)t->sbuf,
                                        count*low_size,
                                        dtype,
                                        (char *)t->rbuf,
                                        count*low_size,
                                        dtype,
                                        t->root_up_rank,
                                        t->up_comm,
                                        t->up_comm->c_coll->coll_gather_module);
        if (t->sbuf_inter_free != NULL) {
            /* free the intermediate buffer allocated in the lg task */
            free(t->sbuf_inter_free);
            t->sbuf_inter_free = NULL;
        }

        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "[%d] Han Gather: ug gather finish\n", t->w_rank));
    }
    /* Completing the collective frees the argument block, so grab the
     * request pointer first */
    ompi_request_t *temp_req = t->req;
    free(t);
    ompi_request_complete(temp_req, 1);
    return OMPI_SUCCESS;
}
/* only work with regular situation (each node has equal number of processes) */

/*
 * Simple (non-task-based) hierarchical gather:
 *   1. blocking low-level gather onto each node leader,
 *   2. blocking upper-level gather from the leaders to the root,
 *   3. reordering on the root when ranks are not mapped by core.
 * Falls back on the previous component when the sub-communicators cannot be
 * built or the processes-per-node distribution is imbalanced.
 *
 * NOTE(review): unlike mca_coll_han_gather_intra there is no MPI_IN_PLACE
 * handling here; a root passing MPI_IN_PLACE would forward it to the
 * low-level gather while the temporary receive buffer lacks the root's
 * contribution -- confirm callers never pass MPI_IN_PLACE to this variant.
 */
int
mca_coll_han_gather_intra_simple(const void *sbuf, int scount,
                                 struct ompi_datatype_t *sdtype,
                                 void *rbuf, int rcount,
                                 struct ompi_datatype_t *rdtype,
                                 int root,
                                 struct ompi_communicator_t *comm,
                                 mca_coll_base_module_t *module)
{
    mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;
    int *topo, w_rank = ompi_comm_rank(comm);
    int w_size = ompi_comm_size(comm);

    /* Create the subcommunicators */
    if( OMPI_SUCCESS != mca_coll_han_comm_create_new(comm, han_module) ) { /* Let's hope the error is consistently returned across the entire communicator */
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "han cannot handle gather with this communicator. Fall back on another component\n"));
        /* HAN cannot work with this communicator so fallback on all collectives */
        HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
        return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf,
                                         rcount, rdtype, root,
                                         comm, comm->c_coll->coll_gather_module);
    }

    /* Topo must be initialized to know rank distribution which then is used to
     * determine if han can be used */
    topo = mca_coll_han_topo_init(comm, han_module, 2);
    if (han_module->are_ppn_imbalanced){
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "han cannot handle gather with this communicator (imbalance). Fall back on another component\n"));
        /* Put back the fallback collective support and call it once. All
         * future calls will then be automatically redirected.
         */
        HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, gather);
        return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf,
                                         rcount, rdtype, root,
                                         comm, comm->c_coll->coll_gather_module);
    }

    ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE];
    ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE];

    /* The root uses its receive signature, everybody else the send one */
    ompi_datatype_t *dtype;
    size_t count;
    if (w_rank == root) {
        dtype = rdtype;
        count = rcount;
    } else {
        dtype = sdtype;
        count = scount;
    }

    /* Get the 'virtual ranks' mapping corresponding to the communicators */
    int *vranks = han_module->cached_vranks;
    /* information about sub-communicators */
    int low_rank = ompi_comm_rank(low_comm);
    int low_size = ompi_comm_size(low_comm);
    /* Get root ranks for low and up comms */
    int root_low_rank, root_up_rank; /* root ranks for both sub-communicators */
    mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank);

    /* allocate buffer to store unordered result on root
     * if the processes are mapped-by core, no need to reorder:
     * distribution of ranks on core first and node next,
     * in a increasing order for both patterns */
    char *reorder_buf = NULL;       // allocated memory
    char *reorder_buf_start = NULL; // start of the data
    if (w_rank == root) {
        if (han_module->is_mapbycore) {
            reorder_buf_start = (char *)rbuf;
        } else {
            OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                                 "[%d]: Future Gather needs reordering: ", w_rank));
            ptrdiff_t rgap = 0;
            ptrdiff_t rsize = opal_datatype_span(&rdtype->super,
                                                 (int64_t)rcount * w_size,
                                                 &rgap);
            reorder_buf = (char *)malloc(rsize);  /* freed after the reorder below */
            /* rgap is the size of unused space at the start of the datatype */
            reorder_buf_start = reorder_buf - rgap;
        }
    }

    /* allocate the intermediary buffer
     * to gather on leaders on the low sub communicator */
    char *tmp_buf = NULL;       // allocated memory
    char *tmp_buf_start = NULL; // start of the data
    if (low_rank == root_low_rank) {
        ptrdiff_t rsize, rgap = 0;
        rsize = opal_datatype_span(&dtype->super,
                                   count * low_size,
                                   &rgap);
        tmp_buf = (char *) malloc(rsize);
        tmp_buf_start = tmp_buf - rgap;
    }

    /* 1. low gather on nodes leaders */
    low_comm->c_coll->coll_gather((char *)sbuf,
                                  count,
                                  dtype,
                                  tmp_buf_start,
                                  count,
                                  dtype,
                                  root_low_rank,
                                  low_comm,
                                  low_comm->c_coll->coll_gather_module);

    /* 2. upper gather (inter-node) between node leaders */
    if (low_rank == root_low_rank) {
        up_comm->c_coll->coll_gather((char *)tmp_buf_start,
                                     count*low_size,
                                     dtype,
                                     (char *)reorder_buf_start,
                                     count*low_size,
                                     dtype,
                                     root_up_rank,
                                     up_comm,
                                     up_comm->c_coll->coll_gather_module);
        if (tmp_buf != NULL) {
            free(tmp_buf);
            tmp_buf = NULL;
            tmp_buf_start = NULL;
        }

        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "[%d] Future Gather: ug gather finish\n", w_rank));
    }

    /* 3. reorder data on root into rbuf
     * if ranks are not mapped in topological order, data needs to be reordered
     * (see reorder_gather)
     */
    if (w_rank == root && !han_module->is_mapbycore) {
        ompi_coll_han_reorder_gather(reorder_buf_start,
                                     rbuf, rcount, rdtype,
                                     comm, topo);
        free(reorder_buf);
    }

    return OMPI_SUCCESS;
}
/*
 * Reorder the blocks produced by a hierarchical gather into rank order.
 *
 * With processes spread over 2 nodes as |0 2 4 6| |1 3 5 7|, the two
 * low-level gathers produce "0 2 4 6" and "1 3 5 7", so the upper-level
 * gather delivers "0 2 4 6 1 3 5 7" in node order.  Block i of that
 * unordered buffer belongs to the world rank stored at
 * topo[i * topolevel + 1], which is where it is copied inside rbuf.
 *
 * sbuf: unordered gather result (read only); rbuf: ordered destination.
 */
void
ompi_coll_han_reorder_gather(const void *sbuf,
                             void *rbuf, int rcount,
                             struct ompi_datatype_t *dtype,
                             struct ompi_communicator_t *comm,
                             int * topo)
{
    const int topolevel = 2; /* always 2 levels in topo */
    const int my_rank = ompi_comm_rank(comm);
    const int comm_size = ompi_comm_size(comm);
    ptrdiff_t ext;
    ompi_datatype_type_extent(dtype, &ext);
    const ptrdiff_t block_bytes = ext * (ptrdiff_t)rcount;

    for (int blk = 0; blk < comm_size; blk++) {
        const int owner = topo[blk * topolevel + 1];
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "[%d]: Future reorder from %d to %d\n",
                             my_rank,
                             blk * topolevel + 1,
                             owner));
        ompi_datatype_copy_content_same_ddt(dtype,
                                            (ptrdiff_t)rcount,
                                            (char *)rbuf + block_bytes * (ptrdiff_t)owner,
                                            (char *)sbuf + block_bytes * blk);
    }
}

339
ompi/mca/coll/han/coll_han_module.c Обычный файл
Просмотреть файл

@ -0,0 +1,339 @@
/*
* Copyright (c) 2018-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "mpi.h"
#include "coll_han.h"
#include "coll_han_dynamic.h"
/*
* Local functions
*/
static int han_module_enable(mca_coll_base_module_t * module,
                             struct ompi_communicator_t *comm);
static int mca_coll_han_module_disable(mca_coll_base_module_t * module,
                                       struct ompi_communicator_t *comm);

/* Reset one cached fallback collective entry (function + module pointers) */
#define CLEAN_PREV_COLL(HANDLE, NAME) \
    do { \
        (HANDLE)->fallback.NAME.NAME = NULL; \
        (HANDLE)->fallback.NAME.module = NULL; \
    } while (0)

/* Clear every cached fallback and reproducible-collective pointer of the
 * module; called from the constructor, destructor and disable paths. */
static void han_module_clear(mca_coll_han_module_t *han_module)
{
    CLEAN_PREV_COLL(han_module, allgather);
    CLEAN_PREV_COLL(han_module, allgatherv);
    CLEAN_PREV_COLL(han_module, allreduce);
    CLEAN_PREV_COLL(han_module, bcast);
    CLEAN_PREV_COLL(han_module, reduce);
    CLEAN_PREV_COLL(han_module, gather);
    CLEAN_PREV_COLL(han_module, scatter);

    han_module->reproducible_reduce = NULL;
    han_module->reproducible_reduce_module = NULL;
    han_module->reproducible_allreduce = NULL;
    han_module->reproducible_allreduce_module = NULL;
}
/* Module constructor: start from a fully cleared state; the cached
 * communicators, topology and fallback pointers are filled in lazily. */
static void mca_coll_han_module_construct(mca_coll_han_module_t * module)
{
    int i;

    module->enabled = true;
    module->super.coll_module_disable = mca_coll_han_module_disable;
    module->cached_low_comms = NULL;
    module->cached_up_comms = NULL;
    module->cached_vranks = NULL;
    module->cached_topo = NULL;
    module->is_mapbycore = false;
    module->storage_initialized = false;
    for( i = 0; i < NB_TOPO_LVL; i++ ) {
        module->sub_comm[i] = NULL;
    }
    for( i = SELF; i < COMPONENTS_COUNT; i++ ) {
        module->modules_storage.modules[i].module_handler = NULL;
    }
    module->dynamic_errors = 0;
    han_module_clear(module);
}
/* Release an OPAL object only when the pointer is non-NULL */
#define OBJ_RELEASE_IF_NOT_NULL(obj) \
    do { \
        if (NULL != (obj)) { \
            OBJ_RELEASE(obj); \
        } \
    } while (0)

/*
 * Module destructor: free the cached sub-communicators, virtual-rank and
 * topology storage, drop the references taken on the previous (fallback)
 * modules, and clear the remaining cached pointers.
 */
static void
mca_coll_han_module_destruct(mca_coll_han_module_t * module)
{
    int i;

    module->enabled = false;
    /* Sub-communicators cached by the task-based algorithms */
    if (module->cached_low_comms != NULL) {
        for (i = 0; i < COLL_HAN_LOW_MODULES; i++) {
            ompi_comm_free(&(module->cached_low_comms[i]));
            module->cached_low_comms[i] = NULL;
        }
        free(module->cached_low_comms);
        module->cached_low_comms = NULL;
    }
    if (module->cached_up_comms != NULL) {
        for (i = 0; i < COLL_HAN_UP_MODULES; i++) {
            ompi_comm_free(&(module->cached_up_comms[i]));
            module->cached_up_comms[i] = NULL;
        }
        free(module->cached_up_comms);
        module->cached_up_comms = NULL;
    }
    if (module->cached_vranks != NULL) {
        free(module->cached_vranks);
        module->cached_vranks = NULL;
    }
    if (module->cached_topo != NULL) {
        free(module->cached_topo);
        module->cached_topo = NULL;
    }
    for(i=0 ; i<NB_TOPO_LVL ; i++) {
        if(NULL != module->sub_comm[i]) {
            ompi_comm_free(&(module->sub_comm[i]));
        }
    }

    /* Drop references retained in han_module_enable().
     * NOTE(review): previous_allgatherv_module is released in the
     * enable/disable paths but not here -- confirm whether that omission
     * is intentional or a missed release. */
    OBJ_RELEASE_IF_NOT_NULL(module->previous_allgather_module);
    OBJ_RELEASE_IF_NOT_NULL(module->previous_allreduce_module);
    OBJ_RELEASE_IF_NOT_NULL(module->previous_bcast_module);
    OBJ_RELEASE_IF_NOT_NULL(module->previous_gather_module);
    OBJ_RELEASE_IF_NOT_NULL(module->previous_reduce_module);
    OBJ_RELEASE_IF_NOT_NULL(module->previous_scatter_module);

    han_module_clear(module);
}

OBJ_CLASS_INSTANCE(mca_coll_han_module_t,
                   mca_coll_base_module_t,
                   mca_coll_han_module_construct,
                   mca_coll_han_module_destruct);
/*
 * Initial query function that is invoked during MPI_INIT, allowing
 * this component to disqualify itself if it doesn't support the
 * required level of thread support. This function is invoked exactly
 * once.
 */
int mca_coll_han_init_query(bool enable_progress_threads,
                            bool enable_mpi_threads)
{
    /* HAN imposes no thread-support restriction here: always volunteer */
    opal_output_verbose(10, ompi_coll_base_framework.framework_output,
                        "coll:han:init_query: pick me! pick me!");
    return OMPI_SUCCESS;
}
/*
 * Invoked when there's a new communicator that has been created.
 * Look at the communicator and decide which set of functions and
 * priority we want to return: HAN disqualifies itself for intercomms,
 * single-process comms, purely node-local comms and non-positive
 * priority; otherwise it returns a module exposing the dynamic
 * (selector) entry points for the collectives it implements.
 */
mca_coll_base_module_t *
mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority)
{
    int flag;
    char info_val[OPAL_MAX_INFO_VAL+1];
    mca_coll_han_module_t *han_module;

    /*
     * If we're intercomm, or if there's only one process in the communicator
     */
    if (OMPI_COMM_IS_INTER(comm)) {
        opal_output_verbose(10, ompi_coll_base_framework.framework_output,
                            "coll:han:comm_query (%d/%s): intercomm; disqualifying myself",
                            comm->c_contextid, comm->c_name);
        return NULL;
    }
    if (1 == ompi_comm_size(comm)) {
        opal_output_verbose(10, ompi_coll_base_framework.framework_output,
                            "coll:han:comm_query (%d/%s): comm is too small; disqualifying myself",
                            comm->c_contextid, comm->c_name);
        return NULL;
    }
    if( !ompi_group_have_remote_peers(comm->c_local_group) ) {
        /* The group only contains local processes. Disable HAN for now */
        opal_output_verbose(10, ompi_coll_base_framework.framework_output,
                            "coll:han:comm_query (%d/%s): comm has only local processes; disqualifying myself",
                            comm->c_contextid, comm->c_name);
        return NULL;
    }

    /* Get the priority level attached to this module. If priority is less
     * than or equal to 0, then the module is unavailable. */
    *priority = mca_coll_han_component.han_priority;
    if (mca_coll_han_component.han_priority <= 0) {
        opal_output_verbose(10, ompi_coll_base_framework.framework_output,
                            "coll:han:comm_query (%d/%s): priority too low; disqualifying myself",
                            comm->c_contextid, comm->c_name);
        return NULL;
    }

    han_module = OBJ_NEW(mca_coll_han_module_t);
    if (NULL == han_module) {
        return NULL;
    }

    /* All is good -- return a module */

    /* Determine the topological level of this communicator: HAN-created
     * sub-communicators carry an info key identifying their level */
    han_module->topologic_level = GLOBAL_COMMUNICATOR;
    if (NULL != comm->super.s_info) {
        /* Get the info value identifying HAN sub-communicators */
        opal_info_get(comm->super.s_info, "ompi_comm_coll_han_topo_level",
                      sizeof(info_val), info_val, &flag);
        if (flag) {
            if (0 == strcmp(info_val, "INTER_NODE")) {
                han_module->topologic_level = INTER_NODE;
            } else {
                han_module->topologic_level = INTRA_NODE;
            }
        }
    }

    han_module->super.coll_module_enable = han_module_enable;
    han_module->super.ft_event = NULL;
    /* Collectives HAN does not provide stay NULL so the previously
     * selected component keeps serving them */
    han_module->super.coll_alltoall = NULL;
    han_module->super.coll_alltoallv = NULL;
    han_module->super.coll_alltoallw = NULL;
    han_module->super.coll_barrier = NULL;
    han_module->super.coll_exscan = NULL;
    han_module->super.coll_gatherv = NULL;
    han_module->super.coll_reduce_scatter = NULL;
    han_module->super.coll_scan = NULL;
    han_module->super.coll_scatterv = NULL;
    /* Dynamic entry points select the concrete algorithm at call time */
    han_module->super.coll_scatter = mca_coll_han_scatter_intra_dynamic;
    han_module->super.coll_reduce = mca_coll_han_reduce_intra_dynamic;
    han_module->super.coll_gather = mca_coll_han_gather_intra_dynamic;
    han_module->super.coll_bcast = mca_coll_han_bcast_intra_dynamic;
    han_module->super.coll_allreduce = mca_coll_han_allreduce_intra_dynamic;
    han_module->super.coll_allgather = mca_coll_han_allgather_intra_dynamic;

    if (GLOBAL_COMMUNICATOR == han_module->topologic_level) {
        /* We are on the global communicator, return topological algorithms */
        han_module->super.coll_allgatherv = NULL;
    } else {
        /* We are on a topologic sub-communicator, return only the selector */
        han_module->super.coll_allgatherv = mca_coll_han_allgatherv_intra_dynamic;
    }

    opal_output_verbose(10, ompi_coll_base_framework.framework_output,
                        "coll:han:comm_query (%d/%s): pick me! pick me!",
                        comm->c_contextid, comm->c_name);
    return &(han_module->super);
}
/*
* In this macro, the following variables are supposed to have been declared
* in the caller:
* . ompi_communicator_t *comm
* . mca_coll_han_module_t *han_module
*/
/* Save the communicator's currently selected collective (function + module)
 * so HAN can use it as a fallback; takes a reference on the module.  Jumps
 * to handle_error when the communicator has no underlying implementation. */
#define HAN_SAVE_PREV_COLL_API(__api) \
    do { \
        if (!comm->c_coll->coll_ ## __api || !comm->c_coll->coll_ ## __api ## _module) { \
            opal_output_verbose(1, ompi_coll_base_framework.framework_output, \
                                "(%d/%s): no underlying " # __api"; disqualifying myself", \
                                comm->c_contextid, comm->c_name); \
            goto handle_error; \
        } \
        han_module->previous_ ## __api = comm->c_coll->coll_ ## __api; \
        han_module->previous_ ## __api ## _module = comm->c_coll->coll_ ## __api ## _module; \
        OBJ_RETAIN(han_module->previous_ ## __api ## _module); \
    } while(0)

/*
 * Init module on the communicator: save the previously selected
 * collectives as fallbacks and compute the reproducible reduce/allreduce
 * decisions.  On error, any references already taken are dropped.
 */
static int
han_module_enable(mca_coll_base_module_t * module,
                  struct ompi_communicator_t *comm)
{
    mca_coll_han_module_t * han_module = (mca_coll_han_module_t*) module;

    HAN_SAVE_PREV_COLL_API(allgather);
    HAN_SAVE_PREV_COLL_API(allgatherv);
    HAN_SAVE_PREV_COLL_API(allreduce);
    HAN_SAVE_PREV_COLL_API(bcast);
    HAN_SAVE_PREV_COLL_API(gather);
    HAN_SAVE_PREV_COLL_API(reduce);
    HAN_SAVE_PREV_COLL_API(scatter);

    /* set reproducible algos */
    mca_coll_han_reduce_reproducible_decision(comm, module);
    mca_coll_han_allreduce_reproducible_decision(comm, module);

    return OMPI_SUCCESS;

handle_error:
    /* NOTE(review): this assumes the previous_* pointers not yet assigned
     * are NULL at this point -- verify they are cleared before the first
     * enable, since the constructor does not set them explicitly. */
    OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allgather_module);
    OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allgatherv_module);
    OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allreduce_module);
    OBJ_RELEASE_IF_NOT_NULL(han_module->previous_bcast_module);
    OBJ_RELEASE_IF_NOT_NULL(han_module->previous_gather_module);
    OBJ_RELEASE_IF_NOT_NULL(han_module->previous_reduce_module);
    OBJ_RELEASE_IF_NOT_NULL(han_module->previous_scatter_module);

    return OMPI_ERROR;
}
/*
 * Module disable: drop the references taken on the previously saved
 * (fallback) collective modules and reset the cached pointers.
 */
static int
mca_coll_han_module_disable(mca_coll_base_module_t * module,
                            struct ompi_communicator_t *comm)
{
    mca_coll_han_module_t * han_module = (mca_coll_han_module_t *) module;

    OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allgather_module);
    OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allgatherv_module);
    OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allreduce_module);
    OBJ_RELEASE_IF_NOT_NULL(han_module->previous_bcast_module);
    OBJ_RELEASE_IF_NOT_NULL(han_module->previous_gather_module);
    OBJ_RELEASE_IF_NOT_NULL(han_module->previous_reduce_module);
    OBJ_RELEASE_IF_NOT_NULL(han_module->previous_scatter_module);

    han_module_clear(han_module);
    return OMPI_SUCCESS;
}
/*
 * Free callback for the temporary request used by the task-based
 * collectives: invalidate the request, release the object, and hand
 * MPI_REQUEST_NULL back to the caller.
 */
int han_request_free(ompi_request_t ** request)
{
    ompi_request_t *req = *request;

    req->req_state = OMPI_REQUEST_INVALID;
    OBJ_RELEASE(req);
    *request = MPI_REQUEST_NULL;
    return OMPI_SUCCESS;
}

444
ompi/mca/coll/han/coll_han_reduce.c Обычный файл
Просмотреть файл

@ -0,0 +1,444 @@
/*
* Copyright (c) 2018-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "coll_han.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "ompi/mca/pml/pml.h"
#include "coll_han_trigger.h"
/* Forward declarations of the pipelined task bodies: t0 reduces segment 0
 * intra-node; t1 overlaps inter-node and intra-node reduces. */
static int mca_coll_han_reduce_t0_task(void *task_args);
static int mca_coll_han_reduce_t1_task(void *task_args);

/*
 * Fill the argument structure shared by the reduce tasks.
 * Plain field-by-field copy; the same structure instance is reused (and
 * mutated: sbuf/rbuf/cur_seg/cur_task) across the pipelined t1 tasks.
 */
static inline void
mca_coll_han_set_reduce_args(mca_coll_han_reduce_args_t * args, mca_coll_task_t * cur_task, void *sbuf, void *rbuf,
                             int seg_count, struct ompi_datatype_t *dtype, struct ompi_op_t *op,
                             int root_up_rank, int root_low_rank,
                             struct ompi_communicator_t *up_comm,
                             struct ompi_communicator_t *low_comm,
                             int num_segments, int cur_seg, int w_rank, int last_seg_count,
                             bool noop, bool is_tmp_rbuf)
{
    args->cur_task = cur_task;
    args->sbuf = sbuf;
    args->rbuf = rbuf;
    args->seg_count = seg_count;
    args->dtype = dtype;
    args->op = op;
    args->root_low_rank = root_low_rank;
    args->root_up_rank = root_up_rank;
    args->up_comm = up_comm;
    args->low_comm = low_comm;
    args->num_segments = num_segments;
    args->cur_seg = cur_seg;
    args->w_rank = w_rank;
    args->last_seg_count = last_seg_count;
    args->noop = noop;
    args->is_tmp_rbuf = is_tmp_rbuf;
}
/*
* Each segment of the messsage needs to go though 2 steps to perform MPI_Reduce:
* lb: low level (shared-memory or intra-node) reduce.
* ub: upper level (inter-node) reduce
* Hence, in each iteration, there is a combination of collective operations which is called a task.
* | seg 0 | seg 1 | seg 2 | seg 3 |
* iter 0 | lr | | | | task: t0, contains lr
* iter 1 | ur | lr | | | task: t1, contains ur and lr
* iter 2 | | ur | lr | | task: t1, contains ur and lr
* iter 3 | | | ur | lr | task: t1, contains ur and lr
* iter 4 | | | | ur | task: t1, contains ur
*/
/*
 * Hierarchical, segmented MPI_Reduce (see pipeline diagram above):
 * the message is cut into segments; a t0 task reduces segment 0
 * intra-node, then a sequence of t1 tasks each overlap the inter-node
 * reduce of the current segment with the intra-node reduce of the next.
 * Only commutative operations are supported; otherwise (or when the
 * communicator cannot be split / is imbalanced) the previous component
 * is used.
 */
int
mca_coll_han_reduce_intra(const void *sbuf,
                          void *rbuf,
                          int count,
                          struct ompi_datatype_t *dtype,
                          ompi_op_t* op,
                          int root,
                          struct ompi_communicator_t *comm,
                          mca_coll_base_module_t * module)
{
    mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module;
    ptrdiff_t extent, lb;
    int seg_count = count, w_rank;
    size_t dtype_size;

    /* No support for non-commutative operations */
    if(!ompi_op_is_commute(op)) {
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "han cannot handle reduce with this operation. Fall back on another component\n"));
        goto prev_reduce_intra;
    }

    /* Create the subcommunicators */
    if( OMPI_SUCCESS != mca_coll_han_comm_create(comm, han_module) ) {
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "han cannot handle reduce with this communicator. Drop HAN support in this communicator and fall back on another component\n"));
        /* HAN cannot work with this communicator so fallback on all modules */
        HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
        return comm->c_coll->coll_reduce(sbuf, rbuf, count, dtype, op, root,
                                         comm, comm->c_coll->coll_reduce_module);
    }

    /* Topo must be initialized to know rank distribution which then is used to
     * determine if han can be used */
    mca_coll_han_topo_init(comm, han_module, 2);
    if (han_module->are_ppn_imbalanced) {
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "han cannot handle reduce with this communicator (imbalanced). Drop HAN support in this communicator and fall back on another component\n"));
        /* Put back the fallback collective support and call it once. All
         * future calls will then be automatically redirected.
         */
        HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, reduce);
        return comm->c_coll->coll_reduce(sbuf, rbuf, count, dtype, op, root,
                                         comm, comm->c_coll->coll_reduce_module);
    }

    ompi_datatype_get_extent(dtype, &lb, &extent);
    w_rank = ompi_comm_rank(comm);
    ompi_datatype_type_size(dtype, &dtype_size);

    ompi_communicator_t *low_comm;
    ompi_communicator_t *up_comm;
    /* use MCA parameters for now */
    low_comm = han_module->cached_low_comms[mca_coll_han_component.han_reduce_low_module];
    up_comm = han_module->cached_up_comms[mca_coll_han_component.han_reduce_up_module];
    /* Derive the per-segment element count from the configured segment size */
    COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_reduce_segsize, dtype_size,
                                seg_count);

    int num_segments = (count + seg_count - 1) / seg_count;
    OPAL_OUTPUT_VERBOSE((20, mca_coll_han_component.han_output,
                         "In HAN seg_count %d count %d num_seg %d\n",
                         seg_count, count, num_segments));

    int *vranks = han_module->cached_vranks;
    int low_rank = ompi_comm_rank(low_comm);
    int low_size = ompi_comm_size(low_comm);
    int up_rank = ompi_comm_rank(up_comm);

    int root_low_rank;
    int root_up_rank;
    mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank);
    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                         "[%d]: root_low_rank %d root_up_rank %d\n", w_rank, root_low_rank,
                         root_up_rank));

    void *tmp_rbuf = rbuf;
    void *tmp_rbuf_to_free = NULL;
    if (low_rank == root_low_rank && root_up_rank != up_rank) {
        /* allocate 2 segments on node leaders that are not the global root;
         * the t1 tasks alternate between the two halves */
        tmp_rbuf = malloc(2*extent*seg_count);
        tmp_rbuf_to_free = tmp_rbuf;
    }

    /* Create t0 tasks for the first segment */
    mca_coll_task_t *t0 = OBJ_NEW(mca_coll_task_t);
    /* Setup up t0 task arguments */
    mca_coll_han_reduce_args_t *t = malloc(sizeof(mca_coll_han_reduce_args_t));
    mca_coll_han_set_reduce_args(t, t0, (char *) sbuf, (char *) tmp_rbuf, seg_count, dtype,
                                 op, root_up_rank, root_low_rank, up_comm, low_comm,
                                 num_segments, 0, w_rank, count - (num_segments - 1) * seg_count,
                                 low_rank != root_low_rank, (NULL != tmp_rbuf_to_free));
    /* Init the first task */
    init_task(t0, mca_coll_han_reduce_t0_task, (void *) t);
    issue_task(t0);

    /* Create t1 task (inter-node reduce of segment 0 overlapped with the
     * intra-node reduce of segment 1) */
    mca_coll_task_t *t1 = OBJ_NEW(mca_coll_task_t);
    /* The argument structure is shared with t0 and mutated between issues */
    t->cur_task = t1;
    /* Init the t1 task */
    init_task(t1, mca_coll_han_reduce_t1_task, (void *) t);
    issue_task(t1);

    /* Remaining pipeline iterations: advance buffers and segment index,
     * then issue the next t1 task */
    while (t->cur_seg <= t->num_segments - 2) {
        /* Create t1 task */
        mca_coll_task_t *t1 = OBJ_NEW(mca_coll_task_t);
        t->cur_task = t1;
        t->sbuf = (char *) t->sbuf + extent * t->seg_count;
        if (up_rank == root_up_rank) {
            /* only the leader that is the global root accumulates directly
             * into successive slices of rbuf */
            t->rbuf = (char *) t->rbuf + extent * t->seg_count;
        }
        t->cur_seg = t->cur_seg + 1;
        /* Init the t1 task */
        init_task(t1, mca_coll_han_reduce_t1_task, (void *) t);
        issue_task(t1);
    }

    free(t);
    free(tmp_rbuf_to_free);
    return OMPI_SUCCESS;

prev_reduce_intra:
    /* unsupported operation: delegate to the previously selected component */
    return han_module->previous_reduce(sbuf, rbuf, count, dtype, op, root,
                                       comm,
                                       han_module->previous_reduce_module);
}
/* t0 task: perform the blocking low-level (intra-node) reduce of the first
 * segment toward the node leader.  Releases the task object it was issued
 * with; the shared argument structure stays owned by the caller. */
int mca_coll_han_reduce_t0_task(void *task_args)
{
    mca_coll_han_reduce_args_t *t = (mca_coll_han_reduce_args_t *) task_args;
    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t0 %d\n", t->w_rank,
                         t->cur_seg));
    OBJ_RELEASE(t->cur_task);
    /* (cleanup: removed unused extent/lb locals and the dead
     * ompi_datatype_get_extent() call that computed them) */
    t->low_comm->c_coll->coll_reduce((char *) t->sbuf, (char *) t->rbuf, t->seg_count, t->dtype,
                                     t->op, t->root_low_rank, t->low_comm,
                                     t->low_comm->c_coll->coll_reduce_module);
    return OMPI_SUCCESS;
}
/* t1 task: overlap the inter-node ireduce of the current segment with the
 * intra-node reduce of the next segment, then wait for the ireduce. */
int mca_coll_han_reduce_t1_task(void *task_args) {
    mca_coll_han_reduce_args_t *t = (mca_coll_han_reduce_args_t *) task_args;
    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t1 %d\n", t->w_rank,
                         t->cur_seg));
    OBJ_RELEASE(t->cur_task);
    ptrdiff_t extent, lb;
    int cur_seg = t->cur_seg;

    ompi_datatype_get_extent(t->dtype, &lb, &extent);
    ompi_request_t *ireduce_req = NULL;
    if (!t->noop) {
        /* node leaders take part in the upper (inter-node) reduce */
        int tmp_count = t->seg_count;
        if (cur_seg == t->num_segments - 1 && t->last_seg_count != t->seg_count) {
            /* short trailing segment */
            tmp_count = t->last_seg_count;
        }
        int up_rank = ompi_comm_rank(t->up_comm);
        /* ur of cur_seg */
        if (up_rank == t->root_up_rank) {
            /* the global root's leader accumulates in place into its slice of rbuf */
            t->up_comm->c_coll->coll_ireduce(MPI_IN_PLACE, (char *) t->rbuf, tmp_count, t->dtype,
                                             t->op, t->root_up_rank, t->up_comm, &ireduce_req,
                                             t->up_comm->c_coll->coll_ireduce_module);
        } else {
            /* this is a node leader that is not root so alternate between the two allocated segments */
            char *tmp_sbuf = (char*)t->rbuf + (cur_seg % 2)*(extent * t->seg_count);
            t->up_comm->c_coll->coll_ireduce(tmp_sbuf, NULL, tmp_count,
                                             t->dtype, t->op, t->root_up_rank, t->up_comm,
                                             &ireduce_req, t->up_comm->c_coll->coll_ireduce_module);
        }
    }
    /* lr of cur_seg+1: blocking intra-node reduce of the next segment,
     * overlapped with the ireduce issued above */
    int next_seg = cur_seg + 1;
    if (next_seg <= t->num_segments - 1) {
        int tmp_count = t->seg_count;
        char *tmp_rbuf = NULL;
        if (next_seg == t->num_segments - 1 && t->last_seg_count != t->seg_count) {
            tmp_count = t->last_seg_count;
        }
        if (t->is_tmp_rbuf) {
            /* non-root leader: receive into the other half of the 2-segment scratch */
            tmp_rbuf = (char*)t->rbuf + (next_seg % 2)*(extent * t->seg_count);
        } else if (NULL != t->rbuf) {
            tmp_rbuf = (char*)t->rbuf + extent * t->seg_count;
        }
        t->low_comm->c_coll->coll_reduce((char *) t->sbuf + extent * t->seg_count,
                                         (char *) tmp_rbuf, tmp_count,
                                         t->dtype, t->op, t->root_low_rank, t->low_comm,
                                         t->low_comm->c_coll->coll_reduce_module);
    }
    if (!t->noop && ireduce_req) {
        /* complete the inter-node reduce of the current segment */
        ompi_request_wait(&ireduce_req, MPI_STATUS_IGNORE);
    }
    return OMPI_SUCCESS;
}
/* In case of non regular situation (imbalanced number of processes per nodes),
 * a fallback is made on the next component that provides a reduce in priority order */
/**
 * Simple (non-pipelined) hierarchical reduce: a blocking intra-node
 * (low_comm) reduce toward the local leader followed by an inter-node
 * (up_comm) reduce toward the root. Falls back on the previously selected
 * component when the operation is not commutative or when HAN cannot be
 * used on this communicator.
 *
 * Fix over the original: the error message emitted when the *up* comm
 * reduce fails used to claim the *low* comm reduce failed (copy-paste).
 */
int
mca_coll_han_reduce_intra_simple(const void *sbuf,
                                 void* rbuf,
                                 int count,
                                 struct ompi_datatype_t *dtype,
                                 ompi_op_t *op,
                                 int root,
                                 struct ompi_communicator_t *comm,
                                 mca_coll_base_module_t *module)
{
    int w_rank; /* information about the global communicator */
    int root_low_rank, root_up_rank; /* root ranks for both sub-communicators */
    int ret;
    int *vranks, low_rank, low_size;
    ptrdiff_t rsize, rgap = 0;
    void * tmp_buf;
    mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;

    /* No support for non-commutative operations */
    if(!ompi_op_is_commute(op)){
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "han cannot handle reduce with this operation. Fall back on another component\n"));
        goto prev_reduce_intra;
    }

    /* Create the subcommunicators */
    if( OMPI_SUCCESS != mca_coll_han_comm_create(comm, han_module) ) {
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "han cannot handle reduce with this communicator. Drop HAN support in this communicator and fall back on another component\n"));
        /* HAN cannot work with this communicator so fallback on all collectives */
        HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
        return comm->c_coll->coll_reduce(sbuf, rbuf, count, dtype, op, root,
                                         comm, comm->c_coll->coll_reduce_module);
    }

    /* Topo must be initialized to know rank distribution which then is used to
     * determine if han can be used */
    mca_coll_han_topo_init(comm, han_module, 2);
    if (han_module->are_ppn_imbalanced) {
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "han cannot handle reduce with this communicator (imbalanced). Drop HAN support in this communicator and fall back on another component\n"));
        /* Put back the fallback collective support and call it once. All
         * future calls will then be automatically redirected.
         */
        HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, reduce);
        return comm->c_coll->coll_reduce(sbuf, rbuf, count, dtype, op, root,
                                         comm, comm->c_coll->coll_reduce_module);
    }

    ompi_communicator_t *low_comm =
        han_module->cached_low_comms[mca_coll_han_component.han_reduce_low_module];
    ompi_communicator_t *up_comm =
        han_module->cached_up_comms[mca_coll_han_component.han_reduce_up_module];

    /* Get the 'virtual ranks' mapping corresponding to the communicators */
    vranks = han_module->cached_vranks;
    w_rank = ompi_comm_rank(comm);
    low_rank = ompi_comm_rank(low_comm);
    low_size = ompi_comm_size(low_comm);
    /* Get root ranks for low and up comms */
    mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank);

    /* Local leaders that are not the global root need an intermediate buffer
     * to receive the intra-node reduction result */
    if (root_low_rank == low_rank && w_rank != root) {
        rsize = opal_datatype_span(&dtype->super, (int64_t)count, &rgap);
        tmp_buf = malloc(rsize);
        if (NULL == tmp_buf) {
            return OMPI_ERROR;
        }
    } else {
        /* global root rbuf is valid, local non-root do not need buffers */
        tmp_buf = rbuf;
    }
    /* No need to handle MPI_IN_PLACE: only the global root may ask for it and
     * it is ok to use it for intermediary reduces since it is also a local root*/

    /* Low_comm reduce */
    ret = low_comm->c_coll->coll_reduce((char *)sbuf, (char *)tmp_buf,
                                        count, dtype, op, root_low_rank,
                                        low_comm, low_comm->c_coll->coll_reduce_module);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)){
        if (root_low_rank == low_rank && w_rank != root){
            free(tmp_buf);
        }
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "HAN/REDUCE: low comm reduce failed. "
                             "Falling back to another component\n"));
        goto prev_reduce_intra;
    }

    /* Up_comm reduce: only local leaders participate */
    if (root_low_rank == low_rank ){
        if(w_rank != root){
            ret = up_comm->c_coll->coll_reduce((char *)tmp_buf, NULL,
                                               count, dtype, op, root_up_rank,
                                               up_comm, up_comm->c_coll->coll_reduce_module);
            free(tmp_buf);
        } else {
            /* Take advantage of any optimisation made for IN_PLACE
             * communcations */
            ret = up_comm->c_coll->coll_reduce(MPI_IN_PLACE, (char *)tmp_buf,
                                               count, dtype, op, root_up_rank,
                                               up_comm, up_comm->c_coll->coll_reduce_module);
        }
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)){
            /* Fixed message: this is the up comm reduce failing, not the low one */
            OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                                 "HAN/REDUCE: up comm reduce failed.\n"));
            return ret;
        }
    }

    return OMPI_SUCCESS;

 prev_reduce_intra:
    return han_module->previous_reduce(sbuf, rbuf, count, dtype, op, root,
                                       comm, han_module->previous_reduce_module);
}
/* Find a fallback on reproducible algorithm
 * use tuned or basic or if impossible whatever available
 */
int
mca_coll_han_reduce_reproducible_decision(struct ompi_communicator_t *comm,
                                          mca_coll_base_module_t *module)
{
    mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module;
    const int rank = ompi_comm_rank(comm);

    /* Make sure the modules seen before HAN are cached in the module storage */
    mca_coll_han_get_all_coll_modules(comm, han_module);

    /* Candidate reproducible modules, in decreasing order of preference */
    const int candidates[] = {TUNED, BASIC};
    for (size_t idx = 0; idx < sizeof(candidates) / sizeof(candidates[0]); idx++) {
        const int candidate = candidates[idx];
        mca_coll_base_module_t *candidate_module =
            han_module->modules_storage.modules[candidate].module_handler;
        if (NULL == candidate_module || NULL == candidate_module->coll_reduce) {
            continue;   /* this component did not provide a reduce */
        }
        if (0 == rank) {
            opal_output_verbose(30, mca_coll_han_component.han_output,
                                "coll:han:reduce_reproducible: "
                                "fallback on %s\n",
                                available_components[candidate].component_name);
        }
        han_module->reproducible_reduce_module = candidate_module;
        han_module->reproducible_reduce = candidate_module->coll_reduce;
        return OMPI_SUCCESS;
    }

    /* Nothing reproducible is available: reuse whatever was selected before HAN */
    if (0 == rank) {
        opal_output_verbose(5, mca_coll_han_component.han_output,
                            "coll:han:reduce_reproducible_decision: "
                            "no reproducible fallback\n");
    }
    han_module->reproducible_reduce_module = han_module->previous_reduce_module;
    han_module->reproducible_reduce = han_module->previous_reduce;
    return OMPI_SUCCESS;
}
/* Fallback on reproducible algorithm */
int
mca_coll_han_reduce_reproducible(const void *sbuf,
                                 void *rbuf,
                                 int count,
                                 struct ompi_datatype_t *dtype,
                                 struct ompi_op_t *op,
                                 int root,
                                 struct ompi_communicator_t *comm,
                                 mca_coll_base_module_t *module)
{
    /* Delegate to the reduce implementation chosen by
     * mca_coll_han_reduce_reproducible_decision() */
    mca_coll_han_module_t *han = (mca_coll_han_module_t *) module;
    return han->reproducible_reduce(sbuf, rbuf, count, dtype, op, root, comm,
                                    han->reproducible_reduce_module);
}

258
ompi/mca/coll/han/coll_han_scatter.c Обычный файл
Просмотреть файл

@ -0,0 +1,258 @@
/*
* Copyright (c) 2018-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "coll_han.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "coll_han_trigger.h"
static int mca_coll_han_scatter_us_task(void *task_args);
static int mca_coll_han_scatter_ls_task(void *task_args);
/* Only work with regular situation (each node has equal number of processes) */
/* Fill the scatter task descriptor used by the us/ls task chain */
static inline void
mca_coll_han_set_scatter_args(mca_coll_han_scatter_args_t * args,
                              mca_coll_task_t * cur_task,
                              void *sbuf,
                              void *sbuf_inter_free,
                              void *sbuf_reorder_free,
                              int scount,
                              struct ompi_datatype_t *sdtype,
                              void *rbuf,
                              int rcount,
                              struct ompi_datatype_t *rdtype,
                              int root,
                              int root_up_rank,
                              int root_low_rank,
                              struct ompi_communicator_t *up_comm,
                              struct ompi_communicator_t *low_comm,
                              int w_rank, bool noop, ompi_request_t * req)
{
    /* Task management */
    args->cur_task = cur_task;
    args->req = req;
    args->w_rank = w_rank;
    args->noop = noop;
    /* Send side, including the temporary buffers to be released later */
    args->sbuf = sbuf;
    args->sbuf_inter_free = sbuf_inter_free;
    args->sbuf_reorder_free = sbuf_reorder_free;
    args->scount = scount;
    args->sdtype = sdtype;
    /* Receive side */
    args->rbuf = rbuf;
    args->rcount = rcount;
    args->rdtype = rdtype;
    /* Topology: roots and sub-communicators */
    args->root = root;
    args->root_up_rank = root_up_rank;
    args->root_low_rank = root_low_rank;
    args->up_comm = up_comm;
    args->low_comm = low_comm;
}
/**
 * Hierarchical scatter: the root reorders its send buffer into node order
 * (if needed), then an asynchronous two-stage task chain performs an
 * inter-node (up_comm) scatter followed by an intra-node (low_comm) scatter.
 * Blocks until the chain completes the local receive.
 */
int
mca_coll_han_scatter_intra(const void *sbuf, int scount,
                           struct ompi_datatype_t *sdtype,
                           void *rbuf, int rcount,
                           struct ompi_datatype_t *rdtype,
                           int root,
                           struct ompi_communicator_t *comm, mca_coll_base_module_t * module)
{
    mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module;
    int i, j, w_rank, w_size;
    w_rank = ompi_comm_rank(comm);
    w_size = ompi_comm_size(comm);

    /* Create the subcommunicators */
    if( OMPI_SUCCESS != mca_coll_han_comm_create(comm, han_module) ) { /* Let's hope the error is consistently returned across the entire communicator */
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "han cannot handle scatter with this communicator. Fall back on another component\n"));
        /* HAN cannot work with this communicator so fallback on all collectives */
        HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm);
        return comm->c_coll->coll_scatter(sbuf, scount, sdtype, rbuf, rcount, rdtype, root,
                                          comm, comm->c_coll->coll_scatter_module);
    }

    /* Topo must be initialized to know rank distribution which then is used to
     * determine if han can be used */
    int* topo = mca_coll_han_topo_init(comm, han_module, 2);
    if (han_module->are_ppn_imbalanced) {
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "han cannot handle scatter with this communicator (imbalance). Fall back on another component\n"));
        /* Put back the fallback collective support and call it once. All
         * future calls will then be automatically redirected.
         */
        HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, scatter);
        return comm->c_coll->coll_scatter(sbuf, scount, sdtype, rbuf, rcount, rdtype, root,
                                          comm, comm->c_coll->coll_scatter_module);
    }

    ompi_communicator_t *low_comm =
        han_module->cached_low_comms[mca_coll_han_component.han_scatter_low_module];
    ompi_communicator_t *up_comm =
        han_module->cached_up_comms[mca_coll_han_component.han_scatter_up_module];
    int *vranks = han_module->cached_vranks;
    int low_rank = ompi_comm_rank(low_comm);
    int low_size = ompi_comm_size(low_comm);
    int up_size = ompi_comm_size(up_comm);

    /* Set up the request used to block until the ls task completes */
    ompi_request_t *temp_request = OBJ_NEW(ompi_request_t);
    temp_request->req_state = OMPI_REQUEST_ACTIVE;
    temp_request->req_type = OMPI_REQUEST_COLL;
    temp_request->req_free = han_request_free;
    temp_request->req_status = (ompi_status_public_t){0};
    temp_request->req_complete = REQUEST_PENDING;

    /* Translate the global root into its low/up sub-communicator ranks */
    int root_low_rank;
    int root_up_rank;
    mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank);
    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                         "[%d]: Han Scatter root %d root_low_rank %d root_up_rank %d\n", w_rank,
                         root, root_low_rank, root_up_rank));

    /* Reorder sbuf based on rank.
     * Suppose, message is 0 1 2 3 4 5 6 7
     * and the processes are mapped on 2 nodes (the processes on the node 0 is 0 2 4 6 and the processes on the node 1 is 1 3 5 7),
     * so the message needs to be reordered to 0 2 4 6 1 3 5 7
     */
    char *reorder_buf = NULL;
    char *reorder_sbuf = NULL;
    if (w_rank == root) {
        /* If the processes are mapped-by core, no need to reorder */
        if (han_module->is_mapbycore) {
            OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                                 "[%d]: Han Scatter is_bycore: ", w_rank));
            reorder_sbuf = (char *) sbuf;
        } else {
            ptrdiff_t ssize, sgap = 0, sextent;
            ompi_datatype_type_extent(sdtype, &sextent);
            ssize = opal_datatype_span(&sdtype->super, (int64_t) scount * w_size, &sgap);
            /* NOTE(review): malloc result is not checked before use — confirm
             * whether an OOM here should fall back or abort */
            reorder_buf = (char *) malloc(ssize);
            reorder_sbuf = reorder_buf - sgap;
            /* Copy each rank's block to its node-ordered position; topo holds
             * (node-id, global-rank) pairs, the rank is at index 2*k+1 */
            for (i = 0; i < up_size; i++) {
                for (j = 0; j < low_size; j++) {
                    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                                         "[%d]: Han Scatter copy from %d %d\n", w_rank,
                                         (i * low_size + j) * 2 + 1,
                                         topo[(i * low_size + j) * 2 + 1]));
                    ompi_datatype_copy_content_same_ddt(sdtype, (ptrdiff_t) scount,
                                                        reorder_sbuf + sextent * (i * low_size +
                                                                                  j) *
                                                        (ptrdiff_t) scount,
                                                        (char *) sbuf +
                                                        sextent *
                                                        (ptrdiff_t) topo[(i * low_size + j) * 2 +
                                                                         1] * (ptrdiff_t) scount);
                }
            }
        }
    }

    /* MPI_IN_PLACE on the receive side: the root keeps its data in sbuf */
    void *dest_buf = rbuf;
    int dest_count = rcount;
    ompi_datatype_t *dest_dtype = rdtype;
    if (MPI_IN_PLACE == rbuf) {
        dest_buf = (void*)sbuf;
        dest_count = scount;
        dest_dtype = sdtype;
    }

    /* Create us task */
    mca_coll_task_t *us = OBJ_NEW(mca_coll_task_t);
    /* Setup us task arguments */
    mca_coll_han_scatter_args_t *us_args = malloc(sizeof(mca_coll_han_scatter_args_t));
    mca_coll_han_set_scatter_args(us_args, us, reorder_sbuf, NULL, reorder_buf, scount, sdtype,
                                  (char *) dest_buf, dest_count, dest_dtype, root, root_up_rank, root_low_rank,
                                  up_comm, low_comm, w_rank, low_rank != root_low_rank,
                                  temp_request);
    /* Init us task */
    init_task(us, mca_coll_han_scatter_us_task, (void *) (us_args));
    /* Issue us task */
    issue_task(us);

    /* Block until the ls task completes the local delivery */
    ompi_request_wait(&temp_request, MPI_STATUS_IGNORE);
    return OMPI_SUCCESS;
}
/* us: upper level (inter-node) scatter task.
 *
 * Node leaders (t->noop == false) allocate a temporary buffer, receive their
 * node's blocks via the up_comm scatter, then point t->sbuf at that buffer
 * so the subsequent ls task can distribute it intra-node.  Non-leaders pass
 * straight through.  The root's reorder buffer is freed here once consumed. */
int mca_coll_han_scatter_us_task(void *task_args)
{
    mca_coll_han_scatter_args_t *t = (mca_coll_han_scatter_args_t *) task_args;

    if (t->noop) {
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] Han Scatter: us noop\n",
                             t->w_rank));
    } else {
        /* Pick the datatype/count that describes the local block: the root
         * scatters in send terms, every other leader in receive terms */
        size_t count;
        ompi_datatype_t *dtype;
        if (t->w_rank == t->root) {
            dtype = t->sdtype;
            count = t->scount;
        } else {
            dtype = t->rdtype;
            count = t->rcount;
        }
        /* Temporary buffer large enough for one block per local rank */
        int low_size = ompi_comm_size(t->low_comm);
        ptrdiff_t rsize, rgap = 0;
        rsize = opal_datatype_span(&dtype->super, (int64_t) count * low_size, &rgap);
        /* NOTE(review): malloc result is not checked before use */
        char *tmp_buf = (char *) malloc(rsize);
        char *tmp_rbuf = tmp_buf - rgap;
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "[%d] Han Scatter: us scatter\n", t->w_rank));
        /* Inter node scatter */
        t->up_comm->c_coll->coll_scatter((char *) t->sbuf, t->scount * low_size, t->sdtype,
                                         tmp_rbuf, t->rcount * low_size, t->rdtype, t->root_up_rank,
                                         t->up_comm, t->up_comm->c_coll->coll_scatter_module);
        /* Hand the received blocks to the ls task; remember the allocation
         * so ls can free it */
        t->sbuf = tmp_rbuf;
        t->sbuf_inter_free = tmp_buf;
    }

    /* The root's reordered send buffer is no longer needed */
    if (t->sbuf_reorder_free != NULL && t->root == t->w_rank) {
        free(t->sbuf_reorder_free);
        t->sbuf_reorder_free = NULL;
    }
    /* Create ls tasks for the current union segment */
    mca_coll_task_t *ls = t->cur_task;
    /* Init ls task */
    init_task(ls, mca_coll_han_scatter_ls_task, (void *) t);
    /* Issue ls task */
    issue_task(ls);
    return OMPI_SUCCESS;
}
/* ls: lower level (shared memory or intra-node) scatter task */
int mca_coll_han_scatter_ls_task(void *task_args)
{
    mca_coll_han_scatter_args_t *t = (mca_coll_han_scatter_args_t *) task_args;
    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] Han Scatter: ls\n",
                         t->w_rank));
    OBJ_RELEASE(t->cur_task);

    /* Distribute the node leader's blocks to the local ranks */
    t->low_comm->c_coll->coll_scatter((char *) t->sbuf, t->scount, t->sdtype, (char *) t->rbuf,
                                      t->rcount, t->rdtype, t->root_low_rank, t->low_comm,
                                      t->low_comm->c_coll->coll_scatter_module);

    /* Node leaders used an intermediate buffer for the inter-node step:
     * release it now that its content has been scattered */
    if (!t->noop && NULL != t->sbuf_inter_free) {
        free(t->sbuf_inter_free);
        t->sbuf_inter_free = NULL;
    }

    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] Han Scatter: ls finish\n",
                         t->w_rank));
    /* Grab the request before freeing the argument block, then signal the
     * waiting collective */
    ompi_request_t *completed_req = t->req;
    free(t);
    ompi_request_complete(completed_req, 1);
    return OMPI_SUCCESS;
}

333
ompi/mca/coll/han/coll_han_subcomms.c Обычный файл
Просмотреть файл

@ -0,0 +1,333 @@
/*
* Copyright (c) 2018-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Warning: this is not for the faint of heart -- don't even bother
* reading this source code if you don't have a strong understanding
* of nested data structures and pointer math (remember that
* associativity and order of C operations is *critical* in terms of
* pointer math!).
*/
#include "ompi_config.h"
#include "mpi.h"
#include "coll_han.h"
#include "coll_han_dynamic.h"
/*
 * Swap COMM's COLL function pointer: stash the currently installed (HAN)
 * function and module in FALLBACKS, then install the pre-HAN fallback cached
 * in HANM.  Used while building the sub-communicators so that the collectives
 * invoked during creation do not re-enter the not-yet-ready HAN module.
 */
#define HAN_SUBCOM_SAVE_COLLECTIVE(FALLBACKS, COMM, HANM, COLL) \
    do { \
        (FALLBACKS).COLL.COLL = (COMM)->c_coll->coll_ ## COLL; \
        (FALLBACKS).COLL.module = (COMM)->c_coll->coll_ ## COLL ## _module; \
        (COMM)->c_coll->coll_ ## COLL = (HANM)->fallback.COLL.COLL; \
        (COMM)->c_coll->coll_ ## COLL ## _module = (HANM)->fallback.COLL.module; \
    } while(0)
/*
 * Restore COMM's COLL function and module from the pointers previously saved
 * in FALLBACKS by HAN_SUBCOM_SAVE_COLLECTIVE.  (HANM is unused here; the
 * parameter is kept for symmetry with the SAVE macro.)
 */
#define HAN_SUBCOM_LOAD_COLLECTIVE(FALLBACKS, COMM, HANM, COLL) \
    do { \
        (COMM)->c_coll->coll_ ## COLL = (FALLBACKS).COLL.COLL; \
        (COMM)->c_coll->coll_ ## COLL ## _module = (FALLBACKS).COLL.module; \
    } while(0)
/*
 * Routine that creates the local hierarchical sub-communicators
 * Called each time a collective is called.
 * comm: input communicator of the collective
 *
 * On success the INTRA_NODE and INTER_NODE sub-communicators and the vrank
 * map are cached in han_module.  Returns OMPI_ERR_NOT_SUPPORTED (and turns
 * the module into a pass-through) when there is one process per node.
 */
int mca_coll_han_comm_create_new(struct ompi_communicator_t *comm,
                                 mca_coll_han_module_t *han_module)
{
    int low_rank, low_size, up_rank, w_rank, w_size;
    ompi_communicator_t **low_comm = &(han_module->sub_comm[INTRA_NODE]);
    ompi_communicator_t **up_comm = &(han_module->sub_comm[INTER_NODE]);
    mca_coll_han_collectives_fallback_t fallbacks;
    int vrank, *vranks;
    opal_info_t comm_info;

    /* The sub communicators have already been created */
    if (han_module->enabled && NULL != han_module->sub_comm[INTRA_NODE]
        && NULL != han_module->sub_comm[INTER_NODE]
        && NULL != han_module->cached_vranks) {
        return OMPI_SUCCESS;
    }

    /*
     * We cannot use han allreduce and allgather without sub-communicators,
     * but we are in the creation of the data structures for the HAN, and
     * temporarily need to save back the old collective.
     *
     * Allgather is used to compute vranks
     * Allreduce is used by ompi_comm_split_type in create_intranode_comm_new
     * Reduce + Bcast may be called by the allreduce implementation
     * Gather + Bcast may be called by the allgather implementation
     */
    HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allgatherv);
    HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allgather);
    HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allreduce);
    HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, bcast);
    HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, reduce);
    HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, gather);
    HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, scatter);

    /**
     * HAN is not yet optimized for a single process per node case, we should
     * avoid selecting it for collective communication support in such cases.
     * However, in order to decide if this is true, we need to know how many
     * local processes are on each node, a condition that cannot be verified
     * outside the MPI support (with PRRTE the info will be eventually available,
     * but we don't want to delay anything until then). We can achieve the same
     * goal by using a reduction over the maximum number of peers per node among
     * all participants.
     */
    int local_procs = ompi_group_count_local_peers(comm->c_local_group);
    comm->c_coll->coll_allreduce(MPI_IN_PLACE, &local_procs, 1, MPI_INT,
                                 MPI_MAX, comm,
                                 comm->c_coll->coll_allreduce_module);
    if( local_procs == 1 ) {
        /* restore saved collectives */
        HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgatherv);
        HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgather);
        HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allreduce);
        HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, bcast);
        HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, reduce);
        HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, gather);
        HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, scatter);
        han_module->enabled = false;  /* entire module set to pass-through from now on */
        return OMPI_ERR_NOT_SUPPORTED;
    }

    OBJ_CONSTRUCT(&comm_info, opal_info_t);

    /* Create topological sub-communicators */
    w_rank = ompi_comm_rank(comm);
    w_size = ompi_comm_size(comm);

    /*
     * This sub-communicator contains the ranks that share my node.
     */
    opal_info_set(&comm_info, "ompi_comm_coll_preference", "han");
    opal_info_set(&comm_info, "ompi_comm_coll_han_topo_level", "INTRA_NODE");
    ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0,
                         &comm_info, low_comm);

    /*
     * Get my local rank and the local size
     */
    low_size = ompi_comm_size(*low_comm);
    low_rank = ompi_comm_rank(*low_comm);

    /*
     * This sub-communicator contains one process per node: processes with the
     * same intra-node rank id share such a sub-communicator
     */
    opal_info_set(&comm_info, "ompi_comm_coll_han_topo_level", "INTER_NODE");
    ompi_comm_split_with_info(comm, low_rank, w_rank, &comm_info, up_comm, false);

    up_rank = ompi_comm_rank(*up_comm);

    /*
     * Set my virtual rank number.
     * my rank # = <intra-node comm size> * <inter-node rank number>
     *             + <intra-node rank number>
     * WARNING: this formula works only if the ranks are perfectly spread over
     * the nodes
     * TODO: find a better way of doing
     */
    vrank = low_size * up_rank + low_rank;
    /* NOTE(review): malloc result not checked before the allgather below */
    vranks = (int *)malloc(sizeof(int) * w_size);
    /*
     * gather vrank from each process so every process will know other processes
     * vrank
     */
    comm->c_coll->coll_allgather(&vrank,
                                 1,
                                 MPI_INT,
                                 vranks,
                                 1,
                                 MPI_INT,
                                 comm,
                                 comm->c_coll->coll_allgather_module);

    /*
     * Set the cached info
     */
    han_module->cached_vranks = vranks;

    /* Reset the saved collectives to point back to HAN */
    HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgatherv);
    HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgather);
    HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allreduce);
    HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, bcast);
    HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, reduce);
    HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, gather);
    HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, scatter);

    OBJ_DESTRUCT(&comm_info);
    return OMPI_SUCCESS;
}
/*
 * Routine that creates the local hierarchical sub-communicators
 * Called each time a collective is called.
 * comm: input communicator of the collective
 *
 * Legacy variant: caches two low (sm, shared) and two up (libnbc, adapt)
 * sub-communicators per module preference, plus the vrank map.
 */
int mca_coll_han_comm_create(struct ompi_communicator_t *comm,
                             mca_coll_han_module_t *han_module)
{
    int low_rank, low_size, up_rank, w_rank, w_size;
    mca_coll_han_collectives_fallback_t fallbacks;
    ompi_communicator_t **low_comms;
    ompi_communicator_t **up_comms;
    int vrank, *vranks;
    opal_info_t comm_info;

    /* use cached communicators if possible */
    if (han_module->enabled && han_module->cached_low_comms != NULL &&
        han_module->cached_up_comms != NULL &&
        han_module->cached_vranks != NULL) {
        return OMPI_SUCCESS;
    }

    /*
     * We cannot use han allreduce and allgather without sub-communicators,
     * but we are in the creation of the data structures for the HAN, and
     * temporarily need to save back the old collective.
     *
     * Allgather is used to compute vranks
     * Allreduce is used by ompi_comm_split_type in create_intranode_comm_new
     * Reduce + Bcast may be called by the allreduce implementation
     * Gather + Bcast may be called by the allgather implementation
     */
    HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allgatherv);
    HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allgather);
    HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allreduce);
    HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, bcast);
    HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, reduce);
    HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, gather);
    HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, scatter);

    /**
     * HAN is not yet optimized for a single process per node case, we should
     * avoid selecting it for collective communication support in such cases.
     * However, in order to decide if this is true, we need to know how many
     * local processes are on each node, a condition that cannot be verified
     * outside the MPI support (with PRRTE the info will be eventually available,
     * but we don't want to delay anything until then). We can achieve the same
     * goal by using a reduction over the maximum number of peers per node among
     * all participants.
     */
    int local_procs = ompi_group_count_local_peers(comm->c_local_group);
    comm->c_coll->coll_allreduce(MPI_IN_PLACE, &local_procs, 1, MPI_INT,
                                 MPI_MAX, comm,
                                 comm->c_coll->coll_allreduce_module);
    if( local_procs == 1 ) {
        /* restore saved collectives */
        HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgatherv);
        HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgather);
        HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allreduce);
        HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, bcast);
        HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, reduce);
        HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, gather);
        HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, scatter);
        han_module->enabled = false;  /* entire module set to pass-through from now on */
        return OMPI_ERR_NOT_SUPPORTED;
    }

    /* create communicators if there is no cached communicator */
    w_rank = ompi_comm_rank(comm);
    w_size = ompi_comm_size(comm);
    /* NOTE(review): these malloc results are not checked before use */
    low_comms = (struct ompi_communicator_t **)malloc(COLL_HAN_LOW_MODULES *
                                                      sizeof(struct ompi_communicator_t *));
    up_comms = (struct ompi_communicator_t **)malloc(COLL_HAN_UP_MODULES *
                                                     sizeof(struct ompi_communicator_t *));

    OBJ_CONSTRUCT(&comm_info, opal_info_t);

    /*
     * Upgrade sm module priority to set up low_comms[0] with sm module
     * This sub-communicator contains the ranks that share my node.
     */
    opal_info_set(&comm_info, "ompi_comm_coll_preference", "sm,^han");
    ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0,
                         &comm_info, &(low_comms[0]));

    /*
     * Get my local rank and the local size
     */
    low_size = ompi_comm_size(low_comms[0]);
    low_rank = ompi_comm_rank(low_comms[0]);

    /*
     * Upgrade shared module priority to set up low_comms[1] with shared module
     * This sub-communicator contains the ranks that share my node.
     */
    opal_info_set(&comm_info, "ompi_comm_coll_preference", "shared,^han");
    ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0,
                         &comm_info, &(low_comms[1]));

    /*
     * Upgrade libnbc module priority to set up up_comms[0] with libnbc module
     * This sub-communicator contains one process per node: processes with the
     * same intra-node rank id share such a sub-communicator
     */
    opal_info_set(&comm_info, "ompi_comm_coll_preference", "libnbc,^han");
    ompi_comm_split_with_info(comm, low_rank, w_rank, &comm_info, &(up_comms[0]), false);

    up_rank = ompi_comm_rank(up_comms[0]);

    /*
     * Upgrade adapt module priority to set up up_comms[1] with adapt module
     * This sub-communicator contains one process per node.
     */
    opal_info_set(&comm_info, "ompi_comm_coll_preference", "adapt,^han");
    ompi_comm_split_with_info(comm, low_rank, w_rank, &comm_info, &(up_comms[1]), false);

    /*
     * Set my virtual rank number.
     * my rank # = <intra-node comm size> * <inter-node rank number>
     *             + <intra-node rank number>
     * WARNING: this formula works only if the ranks are perfectly spread over
     * the nodes
     * TODO: find a better way of doing
     */
    vrank = low_size * up_rank + low_rank;
    vranks = (int *)malloc(sizeof(int) * w_size);
    /*
     * gather vrank from each process so every process will know other processes
     * vrank
     */
    comm->c_coll->coll_allgather(&vrank, 1, MPI_INT, vranks, 1, MPI_INT, comm,
                                 comm->c_coll->coll_allgather_module);

    /*
     * Set the cached info
     */
    han_module->cached_low_comms = low_comms;
    han_module->cached_up_comms = up_comms;
    han_module->cached_vranks = vranks;

    /* Reset the saved collectives to point back to HAN */
    HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgatherv);
    HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgather);
    HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allreduce);
    HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, bcast);
    HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, reduce);
    HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, gather);
    HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, scatter);

    OBJ_DESTRUCT(&comm_info);
    return OMPI_SUCCESS;
}

195
ompi/mca/coll/han/coll_han_topo.c Обычный файл
Просмотреть файл

@ -0,0 +1,195 @@
/*
* Copyright (c) 2018-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Warning: this is not for the faint of heart -- don't even bother
* reading this source code if you don't have a strong understanding
* of nested data structures and pointer math (remember that
* associativity and order of C operations is *critical* in terms of
* pointer math!).
*/
#include "ompi_config.h"
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif /* HAVE_STDLIB_H */
#include "mpi.h"
#include "coll_han.h"
#if OPAL_ENABLE_DEBUG
/* Dump the cached topology array (debug builds only); only the root of the
 * communicator reports, to avoid N copies of the same output. */
static void
mca_coll_han_topo_print(int *topo,
                        struct ompi_communicator_t *comm,
                        int num_topo_level)
{
    int rank = ompi_comm_rank(comm);
    int size = ompi_comm_size(comm);

    if (0 != rank) {
        return;  /* non-root ranks stay silent */
    }
    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: Han topo: ", rank));
    for (int i = 0; i < size * num_topo_level; i++) {
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "%d ", topo[i]));
    }
    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "\n"));
}
#endif /* OPAL_ENABLE_DEBUG */
/**
 * Topology initialization phase
 * Called each time a collective that needs buffer reordering is called
 *
 * @param num_topo_level (IN) Number of the topological levels
 *
 * Builds (and caches in han_module) an array of size * num_topo_level ints
 * holding, for each global position, a (node-id, global-rank) pair.  Returns
 * NULL and sets han_module->are_ppn_imbalanced when the process distribution
 * is not balanced across nodes.
 */
int*
mca_coll_han_topo_init(struct ompi_communicator_t *comm,
                       mca_coll_han_module_t *han_module,
                       int num_topo_level)
{
    /* Reuse the cached topology when available */
    if ( NULL != han_module->cached_topo ) {
        return han_module->cached_topo;
    }

    ompi_communicator_t *up_comm, *low_comm;
    ompi_request_t *request = MPI_REQUEST_NULL;
    int *my_low_rank_map = NULL;
    int *ranks_map = NULL;

    int size = ompi_comm_size(comm);

    /* Prefer the legacy cached sub-communicators; fall back on the new ones */
    if (NULL != han_module->cached_up_comms) {
        up_comm = han_module->cached_up_comms[0];
        low_comm = han_module->cached_low_comms[0];
    } else {
        up_comm = han_module->sub_comm[INTER_NODE];
        low_comm = han_module->sub_comm[INTRA_NODE];
    }
    assert(up_comm != NULL && low_comm != NULL);

    int low_rank = ompi_comm_rank(low_comm);
    int low_size = ompi_comm_size(low_comm);
    int *topo = (int *)malloc(sizeof(int) * size * num_topo_level);
    int is_imbalanced = 1;
    int ranks_consecutive = 1;

    /* node leaders translate the node-local ranks to global ranks and check whether they are placed consecutively */
    if (0 == low_rank) {
        my_low_rank_map = malloc(sizeof(int)*low_size);
        for (int i = 0; i < low_size; ++i) {
            topo[i] = i;    /* topo reused as scratch: identity local ranks */
        }
        ompi_group_translate_ranks(low_comm->c_local_group, low_size, topo,
                                   comm->c_local_group, my_low_rank_map);
        /* check if ranks are consecutive */
        int rank = my_low_rank_map[0] + 1;
        for (int i = 1; i < low_size; ++i, ++rank) {
            if (my_low_rank_map[i] != rank) {
                ranks_consecutive = 0;
                break;
            }
        }
        /* MAX of (x, -x) pairs: after the allreduce, x == -(-x) iff all
         * leaders agreed on the same value of x */
        int reduce_vals[] = {ranks_consecutive, -ranks_consecutive, low_size, -low_size};
        up_comm->c_coll->coll_allreduce(MPI_IN_PLACE, &reduce_vals, 4,
                                        MPI_INT, MPI_MAX, up_comm,
                                        up_comm->c_coll->coll_allreduce_module);
        /* is the distribution of processes balanced per node? */
        is_imbalanced = (reduce_vals[2] == -reduce_vals[3]) ? 0 : 1;
        ranks_consecutive = (reduce_vals[0] == -reduce_vals[1]) ? 1 : 0;

        if ( !ranks_consecutive && !is_imbalanced ) {
            /* kick off up_comm allgather to collect non-consecutive rank information at node leaders */
            ranks_map = malloc(sizeof(int)*size);
            up_comm->c_coll->coll_iallgather(my_low_rank_map, low_size, MPI_INT,
                                             ranks_map, low_size, MPI_INT, up_comm, &request,
                                             up_comm->c_coll->coll_iallgather_module);
        }
    }

    /* broadcast balanced and consecutive properties from node leaders to remaining ranks */
    int bcast_vals[] = {is_imbalanced, ranks_consecutive};
    low_comm->c_coll->coll_bcast(bcast_vals, 2, MPI_INT, 0,
                                 low_comm, low_comm->c_coll->coll_bcast_module);
    is_imbalanced = bcast_vals[0];
    ranks_consecutive = bcast_vals[1];

    /* error out if the rank distribution is not balanced */
    if (is_imbalanced) {
        /* imbalanced implies the iallgather above was never posted */
        assert(MPI_REQUEST_NULL == request);
        han_module->are_ppn_imbalanced = true;
        free(topo);
        if( NULL != my_low_rank_map ) free(my_low_rank_map);
        if( NULL != ranks_map ) free(ranks_map);
        return NULL;
    }

    han_module->are_ppn_imbalanced = false;

    if (ranks_consecutive) {
        /* fast-path: all ranks are consecutive and balanced so fill topology locally */
        for (int i = 0; i < size; ++i) {
            topo[2*i] = (i/low_size); // node leader is node ID
            topo[2*i+1] = i;
        }
        han_module->is_mapbycore = true;
    } else {
        /*
         * Slow path: gather global-to-node-local rank mappings at node leaders
         *
         * The topology will contain a mapping from global consecutive positions
         * to ranks in the communicator.
         *
         * ex: 4 ranks executing on 2 nodes, mapped by node
         *     ranks 0 and 2 on hid0
         *     ranks 1 and 3 on hid1
         *     On entry the topo array looks like
         *     hid0 0 hid1 1 hid0 2 hid1 3
         *     After the sort:
         *     hid0 0 hid0 2 hid1 1 hid1 3
         */
        if (0 == low_rank) {
            /* wait for the iallgather posted above to complete */
            ompi_request_wait(&request, MPI_STATUS_IGNORE);
            /* fill topology */
            for (int i = 0; i < size; ++i) {
                topo[2*i] = ranks_map[(i/low_size)*low_size]; // node leader is node ID
                topo[2*i+1] = ranks_map[i];
            }
            free(ranks_map);
        }
    }

    /* broadcast topology from node leaders to remaining ranks */
    low_comm->c_coll->coll_bcast(topo, num_topo_level*size, MPI_INT, 0,
                                 low_comm, low_comm->c_coll->coll_bcast_module);
    /* free(NULL) is a no-op on non-leaders */
    free(my_low_rank_map);

    han_module->cached_topo = topo;
#if OPAL_ENABLE_DEBUG
    mca_coll_han_topo_print(topo, comm, num_topo_level);
#endif /* OPAL_ENABLE_DEBUG */
    return topo;
}

27
ompi/mca/coll/han/coll_han_trigger.c Обычный файл
Просмотреть файл

@ -0,0 +1,27 @@
/*
* Copyright (c) 2018-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "coll_han_trigger.h"
/* OPAL object constructor: a freshly created task carries no callback
 * and no argument payload until init_task() fills it in. */
static void mca_coll_task_constructor(mca_coll_task_t * task)
{
    task->func_args = NULL;
    task->func_ptr  = NULL;
}
/* OPAL object destructor: scrub the callback and payload pointers so a
 * stale task cannot be issued by mistake after release. */
static void mca_coll_task_destructor(mca_coll_task_t * task)
{
    task->func_args = NULL;
    task->func_ptr  = NULL;
}
/* Register mca_coll_task_t with the OPAL class system, deriving from
 * opal_object_t and wiring in the constructor/destructor above. */
OBJ_CLASS_INSTANCE(mca_coll_task_t, opal_object_t, mca_coll_task_constructor,
                   mca_coll_task_destructor);

49
ompi/mca/coll/han/coll_han_trigger.h Обычный файл
Просмотреть файл

@ -0,0 +1,49 @@
/*
* Copyright (c) 2018-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_COLL_HAN_TRIGGER_EXPORT_H
#define MCA_COLL_HAN_TRIGGER_EXPORT_H
#include "ompi/communicator/communicator.h"
#include "ompi/op/op.h"
#include "ompi/datatype/ompi_datatype.h"
/* Signature of a task body: receives the task's opaque argument block and
 * returns an int status (init_task()/issue_task() below treat it as an
 * OMPI error code — presumably OMPI_SUCCESS on success; confirm with callers). */
typedef int (*task_func_ptr) (void *);

/* A single schedulable unit of collective work: a function pointer plus
 * the arguments to hand it, managed as a reference-counted OPAL object. */
struct mca_coll_task_s {
    opal_object_t super;    /* OPAL object base */
    task_func_ptr func_ptr; /* function invoked by issue_task() */
    void *func_args;        /* opaque argument block passed to func_ptr */
};
typedef struct mca_coll_task_s mca_coll_task_t;
OBJ_CLASS_DECLARATION(mca_coll_task_t);
/* Initialize a task in place: construct the OPAL object, then record the
 * callback and the argument block it will be invoked with.
 * Always returns OMPI_SUCCESS. */
static inline int
init_task(mca_coll_task_t * t, task_func_ptr func_ptr, void *func_args)
{
    OBJ_CONSTRUCT(t, mca_coll_task_t);
    t->func_args = func_args;
    t->func_ptr  = func_ptr;
    return OMPI_SUCCESS;
}
/* Execute the task synchronously: call the stored function on the stored
 * arguments and forward its return status to the caller. */
static inline int
issue_task(mca_coll_task_t * t)
{
    task_func_ptr body = t->func_ptr;
    void *args = t->func_args;
    return body(args);
}
#endif /* MCA_COLL_HAN_TRIGGER_EXPORT_H */

Просмотреть файл

@ -176,7 +176,7 @@ mca_coll_sm_comm_query(struct ompi_communicator_t *comm, int *priority)
if (OMPI_COMM_IS_INTER(comm) || 1 == ompi_comm_size(comm) || ompi_group_have_remote_peers (comm->c_local_group)) {
opal_output_verbose(10, ompi_coll_base_framework.framework_output,
"coll:sm:comm_query (%d/%s): intercomm, comm is too small, or not all peers local; disqualifying myself", comm->c_contextid, comm->c_name);
return NULL;
return NULL;
}
/* Get the priority level attached to this module. If priority is less

Просмотреть файл

@ -1446,7 +1446,8 @@ int ompi_coll_tuned_gather_intra_dec_fixed(const void *sbuf, int scount,
communicator_size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
if (rank == root) {
/* Determine block size */
if ( (rank == root) || (MPI_IN_PLACE == sbuf) ) {
ompi_datatype_type_size(rdtype, &dsize);
total_dsize = dsize * (ptrdiff_t)rcount;
} else {

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2015 The University of Tennessee and The University
* Copyright (c) 2004-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -29,19 +29,19 @@
/* need to include our own topo prototypes so we can malloc data on the comm correctly */
#include "ompi/mca/coll/base/coll_base_topo.h"
/* need file reading function */
#include "ompi/mca/coll/base/coll_base_util.h"
/* also need the dynamic rule structures */
#include "coll_tuned_dynamic_rules.h"
/* and our own prototypes */
#include "coll_tuned_dynamic_file.h"
#define MYEOF -999
static long getnext (FILE *fptr); /* local function */
static int fileline=0; /* used for verbose error messages */
#define getnext(fptr, pval) ompi_coll_base_file_getnext_long(fptr, &fileline, pval)
/*
* Reads a rule file called fname
* Builds the algorithm rule table for a max of n_collectives
@ -56,9 +56,8 @@ static int fileline=0; /* used for verbose error messages */
int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** rules, int n_collectives)
{
long CI, NCS, CS, ALG, NMS, FANINOUT, X, MS, SS;
FILE *fptr = (FILE*) NULL;
int X, CI, NCS, CS, ALG, NMS, FANINOUT;
long MS, SS;
int x, ncs, nms;
ompi_coll_alg_rule_t *alg_rules = (ompi_coll_alg_rule_t*) NULL; /* complete table of rules */
@ -101,45 +100,42 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t**
goto on_file_error;
}
X = (int)getnext(fptr);
if (X<0) {
if( (getnext(fptr, &X) < 0) || (X < 0) ) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of collectives in configuration file around line %d\n", fileline));
goto on_file_error;
}
if (X>n_collectives) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"Number of collectives in configuration file %d is greater than number of MPI collectives possible %d ??? error around line %d\n", X, n_collectives, fileline));
OPAL_OUTPUT((ompi_coll_tuned_stream,"Number of collectives in configuration file %ld is greater than number of MPI collectives possible %d ??? error around line %d\n", X, n_collectives, fileline));
goto on_file_error;
}
for (x=0;x<X;x++) { /* for each collective */
CI = (int)getnext (fptr);
if (CI<0) {
if( (getnext(fptr, &CI) < 0) || (CI < 0) ) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read next Collective id in configuration file around line %d\n", fileline));
goto on_file_error;
}
if (CI>=n_collectives) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"Collective id in configuration file %d is greater than MPI collectives possible %d. Error around line %d\n", CI, n_collectives, fileline));
OPAL_OUTPUT((ompi_coll_tuned_stream,"Collective id in configuration file %ld is greater than MPI collectives possible %d. Error around line %d\n", CI, n_collectives, fileline));
goto on_file_error;
}
if (alg_rules[CI].alg_rule_id != CI) {
OPAL_OUTPUT((ompi_coll_tuned_stream, "Internal error in handling collective ID %d\n", CI));
OPAL_OUTPUT((ompi_coll_tuned_stream, "Internal error in handling collective ID %ld\n", CI));
goto on_file_error;
}
OPAL_OUTPUT((ompi_coll_tuned_stream, "Reading dynamic rule for collective ID %d\n", CI));
OPAL_OUTPUT((ompi_coll_tuned_stream, "Reading dynamic rule for collective ID %ld\n", CI));
alg_p = &alg_rules[CI];
alg_p->alg_rule_id = CI;
alg_p->n_com_sizes = 0;
alg_p->com_rules = (ompi_coll_com_rule_t *) NULL;
NCS = (int)getnext (fptr);
if (NCS<0) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read count of communicators for collective ID %d at around line %d\n", CI, fileline));
if( (getnext (fptr, &NCS) < 0) || (NCS < 0) ) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read count of communicators for collective ID %ld at around line %d\n", CI, fileline));
goto on_file_error;
}
OPAL_OUTPUT((ompi_coll_tuned_stream, "Read communicator count %d for dynamic rule for collective ID %d\n", NCS, CI));
OPAL_OUTPUT((ompi_coll_tuned_stream, "Read communicator count %ld for dynamic rule for collective ID %ld\n", NCS, CI));
alg_p->n_com_sizes = NCS;
alg_p->com_rules = ompi_coll_tuned_mk_com_rules (NCS, CI);
if (NULL == alg_p->com_rules) {
@ -151,20 +147,18 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t**
com_p = &(alg_p->com_rules[ncs]);
CS = (int)getnext (fptr);
if (CS<0) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read communicator size for collective ID %d com rule %d at around line %d\n", CI, ncs, fileline));
if( (getnext (fptr, &CS) < 0) || (CS < 0) ) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read communicator size for collective ID %ld com rule %d at around line %d\n", CI, ncs, fileline));
goto on_file_error;
}
com_p->mpi_comsize = CS;
NMS = (int)getnext (fptr);
if (NMS<0) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of message sizes for collective ID %d com rule %d at around line %d\n", CI, ncs, fileline));
if( (getnext (fptr, &NMS) < 0) || (NMS < 0) ) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of message sizes for collective ID %ld com rule %d at around line %d\n", CI, ncs, fileline));
goto on_file_error;
}
OPAL_OUTPUT((ompi_coll_tuned_stream, "Read message count %d for dynamic rule for collective ID %d and comm size %d\n",
OPAL_OUTPUT((ompi_coll_tuned_stream, "Read message count %ld for dynamic rule for collective ID %ld and comm size %ld\n",
NMS, CI, CS));
com_p->n_msg_sizes = NMS;
com_p->msg_rules = ompi_coll_tuned_mk_msg_rules (NMS, CI, ncs, CS);
@ -179,37 +173,33 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t**
msg_p = &(com_p->msg_rules[nms]);
MS = getnext (fptr);
if (MS<0) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read message size for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline));
if( (getnext (fptr, &MS) < 0) || (MS < 0) ) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read message size for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline));
goto on_file_error;
}
msg_p->msg_size = (size_t)MS;
ALG = (int)getnext (fptr);
if (ALG<0) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target algorithm method for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline));
if( (getnext (fptr, &ALG) < 0) || (ALG < 0) ) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target algorithm method for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline));
goto on_file_error;
}
msg_p->result_alg = ALG;
FANINOUT = (int)getnext (fptr);
if (FANINOUT<0) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read fan in/out topo for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline));
if( (getnext (fptr, &FANINOUT) < 0) || (FANINOUT < 0) ) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read fan in/out topo for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline));
goto on_file_error;
}
msg_p->result_topo_faninout = FANINOUT;
SS = getnext (fptr);
if (SS<0) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target segment size for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline));
if( (getnext (fptr, &SS) < 0) || (SS < 0) ) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target segment size for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline));
goto on_file_error;
}
msg_p->result_segsize = SS;
if (!nms && MS) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"All algorithms must specify a rule for message size of zero upwards always first!\n"));
OPAL_OUTPUT((ompi_coll_tuned_stream,"Message size was %lu for collective ID %d com rule %d msg rule %d at around line %d\n", MS, CI, ncs, nms, fileline));
OPAL_OUTPUT((ompi_coll_tuned_stream,"Message size was %lu for collective ID %ld com rule %d msg rule %d at around line %d\n", MS, CI, ncs, nms, fileline));
goto on_file_error;
}
@ -222,7 +212,7 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t**
} /* comm size */
total_alg_count++;
OPAL_OUTPUT((ompi_coll_tuned_stream, "Done reading dynamic rule for collective ID %d\n", CI));
OPAL_OUTPUT((ompi_coll_tuned_stream, "Done reading dynamic rule for collective ID %ld\n", CI));
} /* per collective */
@ -261,36 +251,3 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t**
return (-1);
}
static void skiptonewline (FILE *fptr)
{
char val;
int rc;
do {
rc = fread(&val, 1, 1, fptr);
if (0 == rc) return;
if ((1 == rc)&&('\n' == val)) {
fileline++;
return;
}
} while (1);
}
/* Read the next long integer token from the rules file.
 *
 * Non-numeric characters are consumed one at a time; '#' introduces a
 * comment that is skipped through end-of-line, and newlines bump the
 * file-scope line counter. Returns the parsed value, or MYEOF once the
 * file is exhausted.
 */
static long getnext (FILE *fptr)
{
    long val;
    int rc;
    char trash;

    do {
        rc = fscanf(fptr, "%li", &val);
        if (rc == EOF) return MYEOF;
        if (1 == rc) return val;
        /* in all other cases, skip to the end */
        rc = (int)fread(&trash, 1, 1, fptr);
        /* BUGFIX: fread() signals end-of-file by returning 0, never EOF (-1);
         * the old "rc == EOF" comparison could never be true. */
        if (0 == rc) return MYEOF;
        if ('\n' == trash) fileline++;
        if ('#' == trash) skiptonewline (fptr);
    } while (1);
}

Просмотреть файл

@ -54,7 +54,7 @@ static void ompi_request_construct(ompi_request_t* req)
/* don't call _INIT, we don't to set the request to _INACTIVE and there will
* be no matching _FINI invocation */
req->req_state = OMPI_REQUEST_INVALID;
req->req_complete = false;
req->req_complete = REQUEST_COMPLETED;
req->req_persistent = false;
req->req_start = NULL;
req->req_free = NULL;

Просмотреть файл

@ -380,7 +380,7 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor,
}
complete_loop:
assert( pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED );
if( 0 != iov_len_local ) {
if( (pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA) && (0 != iov_len_local) ) {
unsigned char* temp = conv_ptr;
/* We have some partial data here. Let's copy it into the convertor
* and keep it hot until the next round.
@ -391,7 +391,7 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor,
opal_unpack_partial_datatype( pConvertor, pElem,
iov_ptr, 0, iov_len_local,
&temp );
pConvertor->partial_length = iov_len_local;
iov_len_local = 0;
}