1
1

Many fixes and improvements to ADAPT

- Add support for fallback to previous coll module on non-commutative operations (#30)
- Replace mutexes by atomic operations.
- Use the correct nbc request type (for both ibcast and ireduce)
  * coll/base: document type casts in ompi_coll_base_retain_*
- add module-wide topology cache
- use standard instead of synchronous send and add mca parameter to control mode of initial send in ireduce/ibcast
- reduce number of memory allocations
- call the default request completion.
  - Remove the requests from the Fortran lookup conversion tables before completing
    and freeing them.

Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
Signed-off-by: Joseph Schuchart <schuchart@hlrs.de>

Co-authored-by: Joseph Schuchart <schuchart@hlrs.de>
Этот коммит содержится в:
George Bosilca 2020-09-08 15:36:26 -04:00
родитель 43e3addca6
Коммит c98e387a53
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 09C926752C9F09B1
11 изменённых файлов: 630 добавлений и 617 удалений

Просмотреть файл

@ -24,7 +24,9 @@ sources = \
coll_adapt_inbuf.c \
coll_adapt_inbuf.h \
coll_adapt_item.c \
coll_adapt_item.h
coll_adapt_item.h \
coll_adapt_topocache.c \
coll_adapt_topocache.h
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la

Просмотреть файл

@ -3,9 +3,9 @@
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
*
* Additional copyrights may follow
*
*
* $HEADER$
*/
@ -25,6 +25,17 @@ BEGIN_C_DECLS
typedef struct mca_coll_adapt_module_t mca_coll_adapt_module_t;
typedef enum {
OMPI_COLL_ADAPT_ALGORITHM_TUNED = 0,
OMPI_COLL_ADAPT_ALGORITHM_BINOMIAL,
OMPI_COLL_ADAPT_ALGORITHM_IN_ORDER_BINOMIAL,
OMPI_COLL_ADAPT_ALGORITHM_BINARY,
OMPI_COLL_ADAPT_ALGORITHM_PIPELINE,
OMPI_COLL_ADAPT_ALGORITHM_CHAIN,
OMPI_COLL_ADAPT_ALGORITHM_LINEAR,
OMPI_COLL_ADAPT_ALGORITHM_COUNT /* number of algorithms, keep last! */
} ompi_coll_adapt_algorithm_t;
/*
* Structure to hold the adapt coll component. First it holds the
* base coll component, and then holds a bunch of
@ -56,6 +67,7 @@ typedef struct mca_coll_adapt_component_t {
size_t adapt_ibcast_segment_size;
int adapt_ibcast_max_send_requests;
int adapt_ibcast_max_recv_requests;
bool adapt_ibcast_synchronous_send;
/* Bcast free list */
opal_free_list_t *adapt_ibcast_context_free_list;
@ -67,17 +79,54 @@ typedef struct mca_coll_adapt_component_t {
int adapt_inbuf_free_list_min;
int adapt_inbuf_free_list_max;
int adapt_inbuf_free_list_inc;
bool adapt_ireduce_synchronous_send;
/* Reduce free list */
opal_free_list_t *adapt_ireduce_context_free_list;
} mca_coll_adapt_component_t;
/*
* Structure used to store what is necessary for the collective operations
* routines in case of fallback.
*/
typedef struct mca_coll_adapt_collective_fallback_s {
union {
mca_coll_base_module_reduce_fn_t reduce;
mca_coll_base_module_ireduce_fn_t ireduce;
} previous_routine;
mca_coll_base_module_t *previous_module;
} mca_coll_adapt_collective_fallback_t;
typedef enum mca_coll_adapt_colltype {
ADAPT_REDUCE = 0,
ADAPT_IREDUCE = 1,
ADAPT_COLLCOUNT
} mca_coll_adapt_colltype_t;
/*
* Some defines to stick to the naming used in the other components in terms of
* fallback routines
*/
#define previous_reduce previous_routines[ADAPT_REDUCE].previous_routine.reduce
#define previous_ireduce previous_routines[ADAPT_IREDUCE].previous_routine.ireduce
#define previous_reduce_module previous_routines[ADAPT_REDUCE].previous_module
#define previous_ireduce_module previous_routines[ADAPT_IREDUCE].previous_module
/* Coll adapt module per communicator*/
struct mca_coll_adapt_module_t {
/* Base module */
mca_coll_base_module_t super;
/* To be able to fallback when the cases are not supported */
struct mca_coll_adapt_collective_fallback_s previous_routines[ADAPT_COLLCOUNT];
/* cached topologies */
opal_list_t *topo_cache;
/* Whether this module has been lazily initialized or not yet */
bool adapt_enabled;
};

Просмотреть файл

@ -3,9 +3,9 @@
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
*
* Additional copyrights may follow
*
*
* $HEADER$
*/
@ -13,6 +13,21 @@
#include "coll_adapt_context.h"
/* OBJ class constructor: initialize the embedded containers of a
 * constant (per-operation) reduce context before first use.  The
 * matching destructor below must tear these down in a safe order. */
static void adapt_constant_reduce_context_construct(ompi_coll_adapt_constant_reduce_context_t *context)
{
/* List of received-but-not-yet-sent segments */
OBJ_CONSTRUCT(&context->recv_list, opal_list_t);
/* Mutex guarding recv_list */
OBJ_CONSTRUCT(&context->mutex_recv_list, opal_mutex_t);
/* Free list backing the temporary inbound buffers */
OBJ_CONSTRUCT(&context->inbuf_list, opal_free_list_t);
}
/* OBJ class destructor: release the containers set up by the
 * constructor.  The mutex is destroyed before the list it protects,
 * so no waiter can still be holding it. */
static void adapt_constant_reduce_context_destruct(ompi_coll_adapt_constant_reduce_context_t *context)
{
OBJ_DESTRUCT(&context->mutex_recv_list);
OBJ_DESTRUCT(&context->recv_list);
OBJ_DESTRUCT(&context->inbuf_list);
}
OBJ_CLASS_INSTANCE(ompi_coll_adapt_bcast_context_t, opal_free_list_item_t,
NULL, NULL);
@ -23,4 +38,5 @@ OBJ_CLASS_INSTANCE(ompi_coll_adapt_reduce_context_t, opal_free_list_item_t,
NULL, NULL);
OBJ_CLASS_INSTANCE(ompi_coll_adapt_constant_reduce_context_t, opal_object_t,
NULL, NULL);
&adapt_constant_reduce_context_construct,
&adapt_constant_reduce_context_destruct);

Просмотреть файл

@ -3,9 +3,9 @@
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
*
* Additional copyrights may follow
*
*
* $HEADER$
*/
@ -74,20 +74,19 @@ struct ompi_coll_adapt_constant_reduce_context_s {
/* Increment of each segment */
int segment_increment;
int num_segs;
ompi_request_t *request;
int rank;
int root;
/* The distance between the address of inbuf->buff and the address of inbuf */
int distance;
int ireduce_tag;
/* How many sends are posted but not finished */
int32_t ongoing_send;
/* Length of the fragment array, which is the number of received segments */
int32_t num_recv_segs;
/* Number of sent segments */
int32_t num_sent_segs;
/* Next segment to be received from each child */
opal_atomic_int32_t *next_recv_segs;
/* Mutex to protect recv_list */
opal_mutex_t *mutex_recv_list;
/* Mutex to protect num_recv_segs */
opal_mutex_t *mutex_num_recv_segs;
/* Mutex to protect num_sent */
opal_mutex_t *mutex_num_sent;
int32_t *next_recv_segs;
/* Mutex to protect each segment when do the reduce op */
opal_mutex_t *mutex_op_list;
/* Reduce operation */
@ -95,20 +94,15 @@ struct ompi_coll_adapt_constant_reduce_context_s {
ompi_coll_tree_t *tree;
/* Accumulate buff */
char **accumbuf;
/* inbuf list address of accumbuf */
ompi_coll_adapt_inbuf_t ** accumbuf_to_inbuf;
opal_free_list_t *inbuf_list;
/* A list to store the segments which are received and not yet be sent */
opal_list_t *recv_list;
ptrdiff_t lower_bound;
/* How many sends are posted but not finished */
opal_atomic_int32_t ongoing_send;
char *sbuf;
char *rbuf;
int root;
/* The distance between the address of inbuf->buff and the address of inbuf */
int distance;
int ireduce_tag;
opal_free_list_t inbuf_list;
/* Mutex to protect recv_list */
opal_mutex_t mutex_recv_list;
/* A list to store the segments which are received and not yet be sent */
opal_list_t recv_list;
ompi_request_t *request;
};
typedef struct ompi_coll_adapt_constant_reduce_context_s ompi_coll_adapt_constant_reduce_context_t;
@ -123,7 +117,7 @@ typedef int (*ompi_coll_adapt_reduce_cuda_callback_fn_t) (ompi_coll_adapt_reduce
struct ompi_coll_adapt_reduce_context_s {
opal_free_list_item_t super;
char *buff;
int frag_id;
int seg_index;
int child_id;
int peer;
ompi_coll_adapt_constant_reduce_context_t *con;

Просмотреть файл

@ -3,9 +3,9 @@
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
*
* Additional copyrights may follow
*
*
* $HEADER$
*/
@ -14,6 +14,7 @@
#include "coll_adapt.h"
#include "coll_adapt_algorithms.h"
#include "coll_adapt_context.h"
#include "coll_adapt_topocache.h"
#include "ompi/mca/coll/base/coll_base_util.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "opal/util/bit_ops.h"
@ -22,31 +23,6 @@
static int ompi_coll_adapt_ibcast_generic(IBCAST_ARGS,
ompi_coll_tree_t * tree, size_t seg_size);
static int ompi_coll_adapt_ibcast_binomial(IBCAST_ARGS);
static int ompi_coll_adapt_ibcast_in_order_binomial(IBCAST_ARGS);
static int ompi_coll_adapt_ibcast_binary(IBCAST_ARGS);
static int ompi_coll_adapt_ibcast_pipeline(IBCAST_ARGS);
static int ompi_coll_adapt_ibcast_chain(IBCAST_ARGS);
static int ompi_coll_adapt_ibcast_linear(IBCAST_ARGS);
static int ompi_coll_adapt_ibcast_tuned(IBCAST_ARGS);
typedef int (*ompi_coll_adapt_ibcast_fn_t) (void *buff,
int count,
struct ompi_datatype_t * datatype,
int root,
struct ompi_communicator_t * comm,
ompi_request_t ** request,
mca_coll_base_module_t * module);
static ompi_coll_adapt_algorithm_index_t ompi_coll_adapt_ibcast_algorithm_index[] = {
{0, {ompi_coll_adapt_ibcast_tuned}},
{1, {ompi_coll_adapt_ibcast_binomial}},
{2, {ompi_coll_adapt_ibcast_in_order_binomial}},
{3, {ompi_coll_adapt_ibcast_binary}},
{4, {ompi_coll_adapt_ibcast_pipeline}},
{5, {ompi_coll_adapt_ibcast_chain}},
{6, {ompi_coll_adapt_ibcast_linear}},
};
/*
* Set up MCA parameters of MPI_Bcast and MPI_IBcast
@ -61,7 +37,7 @@ int ompi_coll_adapt_ibcast_register(void)
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_READONLY,
&mca_coll_adapt_component.adapt_ibcast_algorithm);
if( (mca_coll_adapt_component.adapt_ibcast_algorithm < 0) ||
(mca_coll_adapt_component.adapt_ibcast_algorithm > (int32_t)(sizeof(ompi_coll_adapt_ibcast_algorithm_index) / sizeof(ompi_coll_adapt_algorithm_index_t))) ) {
(mca_coll_adapt_component.adapt_ibcast_algorithm >= OMPI_COLL_ADAPT_ALGORITHM_COUNT) ) {
mca_coll_adapt_component.adapt_ibcast_algorithm = 1;
}
@ -89,6 +65,14 @@ int ompi_coll_adapt_ibcast_register(void)
MCA_BASE_VAR_SCOPE_READONLY,
&mca_coll_adapt_component.adapt_ibcast_max_recv_requests);
mca_coll_adapt_component.adapt_ibcast_synchronous_send = true;
(void) mca_base_component_var_register(c, "bcast_synchronous_send",
"Whether to use synchronous send operations during setup of bcast operations",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_coll_adapt_component.adapt_ibcast_synchronous_send);
mca_coll_adapt_component.adapt_ibcast_context_free_list = NULL;
return OMPI_SUCCESS;
}
@ -120,8 +104,6 @@ static int ibcast_request_fini(ompi_coll_adapt_bcast_context_t * context)
}
OBJ_RELEASE(context->con->mutex);
OBJ_RELEASE(context->con);
opal_free_list_return(mca_coll_adapt_component.adapt_ibcast_context_free_list,
(opal_free_list_item_t *) context);
ompi_request_complete(temp_req, 1);
return OMPI_SUCCESS;
@ -148,6 +130,8 @@ static int send_cb(ompi_request_t * req)
ompi_coll_adapt_bcast_context_t *send_context;
int new_id = context->con->recv_array[sent_id];
ompi_request_t *send_req;
++(context->con->send_array[context->child_id]);
OPAL_THREAD_UNLOCK(context->con->mutex);
send_context = (ompi_coll_adapt_bcast_context_t *) opal_free_list_wait(mca_coll_adapt_component.adapt_ibcast_context_free_list);
send_context->buff =
@ -160,7 +144,6 @@ static int send_cb(ompi_request_t * req)
if (new_id == (send_context->con->num_segs - 1)) {
send_count = send_context->con->count - new_id * send_context->con->seg_count;
}
++(send_context->con->send_array[send_context->child_id]);
char *send_buff = send_context->buff;
OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output,
"[%d]: Send(start in send cb): segment %d to %d at buff %p send_count %d tag %d\n",
@ -170,16 +153,14 @@ static int send_cb(ompi_request_t * req)
err = MCA_PML_CALL(isend
(send_buff, send_count, send_context->con->datatype, send_context->peer,
send_context->con->ibcast_tag - new_id,
MCA_PML_BASE_SEND_SYNCHRONOUS, send_context->con->comm, &send_req));
MCA_PML_BASE_SEND_STANDARD, send_context->con->comm, &send_req));
if (MPI_SUCCESS != err) {
opal_free_list_return(mca_coll_adapt_component.adapt_ibcast_context_free_list,
(opal_free_list_item_t *)send_context);
OPAL_THREAD_UNLOCK(context->con->mutex);
OBJ_RELEASE(context->con);
return err;
}
/* Set send callback */
OPAL_THREAD_UNLOCK(context->con->mutex);
ompi_request_set_callback(send_req, send_cb, send_context);
OPAL_THREAD_LOCK(context->con->mutex);
} else {
@ -190,25 +171,21 @@ static int send_cb(ompi_request_t * req)
int num_sent = ++(context->con->num_sent_segs);
int num_recv_fini = context->con->num_recv_fini;
int rank = ompi_comm_rank(context->con->comm);
OPAL_THREAD_UNLOCK(context->con->mutex);
/* Check whether signal the condition */
if ((rank == context->con->root
&& num_sent == context->con->tree->tree_nextsize * context->con->num_segs)
|| (context->con->tree->tree_nextsize > 0 && rank != context->con->root
&& num_sent == context->con->tree->tree_nextsize * context->con->num_segs
&& num_recv_fini == context->con->num_segs)
|| (context->con->tree->tree_nextsize == 0
&& num_recv_fini == context->con->num_segs)) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Singal in send\n",
ompi_comm_rank(context->con->comm)));
OPAL_THREAD_UNLOCK(context->con->mutex);
ibcast_request_fini(context);
} else {
opal_free_list_return(mca_coll_adapt_component.adapt_ibcast_context_free_list,
(opal_free_list_item_t *) context);
OPAL_THREAD_UNLOCK(context->con->mutex);
}
opal_free_list_return(mca_coll_adapt_component.adapt_ibcast_context_free_list,
(opal_free_list_item_t *) context);
req->req_free(&req);
/* Call back function return 1, which means successful */
/* Call back function return 1 to signal that request has been free'd */
return 1;
}
@ -231,6 +208,7 @@ static int recv_cb(ompi_request_t * req)
OPAL_THREAD_LOCK(context->con->mutex);
int num_recv_segs = ++(context->con->num_recv_segs);
context->con->recv_array[num_recv_segs - 1] = context->frag_id;
OPAL_THREAD_UNLOCK(context->con->mutex);
int new_id = num_recv_segs + mca_coll_adapt_component.adapt_ibcast_max_recv_requests - 1;
/* Receive new segment */
@ -262,16 +240,21 @@ static int recv_cb(ompi_request_t * req)
recv_context->con->comm, &recv_req));
/* Set the receive callback */
OPAL_THREAD_UNLOCK(context->con->mutex);
ompi_request_set_callback(recv_req, recv_cb, recv_context);
OPAL_THREAD_LOCK(context->con->mutex);
}
OPAL_THREAD_LOCK(context->con->mutex);
/* Propagate segment to all children */
for (i = 0; i < context->con->tree->tree_nextsize; i++) {
/* If the current process can send the segment now, which means the only segment need to be sent is the just arrived one */
if (num_recv_segs - 1 == context->con->send_array[i]) {
ompi_request_t *send_req;
++(context->con->send_array[i]);
/* release mutex to avoid deadlock in case a callback is triggered below */
OPAL_THREAD_UNLOCK(context->con->mutex);
int send_count = context->con->seg_count;
if (context->frag_id == (context->con->num_segs - 1)) {
send_count = context->con->count - context->frag_id * context->con->seg_count;
@ -285,7 +268,6 @@ static int recv_cb(ompi_request_t * req)
send_context->peer = context->con->tree->tree_next[i];
send_context->con = context->con;
OBJ_RETAIN(context->con);
++(send_context->con->send_array[i]);
char *send_buff = send_context->buff;
OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output,
"[%d]: Send(start in recv cb): segment %d to %d at buff %p send_count %d tag %d\n",
@ -297,17 +279,17 @@ static int recv_cb(ompi_request_t * req)
(send_buff, send_count, send_context->con->datatype,
send_context->peer,
send_context->con->ibcast_tag - send_context->frag_id,
MCA_PML_BASE_SEND_SYNCHRONOUS, send_context->con->comm, &send_req));
MCA_PML_BASE_SEND_STANDARD, send_context->con->comm, &send_req));
if (MPI_SUCCESS != err) {
opal_free_list_return(mca_coll_adapt_component.adapt_ibcast_context_free_list,
(opal_free_list_item_t *)send_context);
OPAL_THREAD_UNLOCK(context->con->mutex);
OBJ_RELEASE(context->con);
return err;
}
/* Set send callback */
OPAL_THREAD_UNLOCK(context->con->mutex);
ompi_request_set_callback(send_req, send_cb, send_context);
/* retake the mutex for next iteration */
OPAL_THREAD_LOCK(context->con->mutex);
}
}
@ -316,26 +298,23 @@ static int recv_cb(ompi_request_t * req)
int num_sent = context->con->num_sent_segs;
int num_recv_fini = ++(context->con->num_recv_fini);
int rank = ompi_comm_rank(context->con->comm);
OPAL_THREAD_UNLOCK(context->con->mutex);
/* If this process is leaf and has received all the segments */
if ((rank == context->con->root
&& num_sent == context->con->tree->tree_nextsize * context->con->num_segs)
|| (context->con->tree->tree_nextsize > 0 && rank != context->con->root
if ((context->con->tree->tree_nextsize > 0
&& num_sent == context->con->tree->tree_nextsize * context->con->num_segs
&& num_recv_fini == context->con->num_segs)
|| (context->con->tree->tree_nextsize == 0
&& num_recv_fini == context->con->num_segs)) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Singal in recv\n",
ompi_comm_rank(context->con->comm)));
OPAL_THREAD_UNLOCK(context->con->mutex);
ibcast_request_fini(context);
} else {
opal_free_list_return(mca_coll_adapt_component.adapt_ibcast_context_free_list,
(opal_free_list_item_t *) context);
OPAL_THREAD_UNLOCK(context->con->mutex);
}
opal_free_list_return(mca_coll_adapt_component.adapt_ibcast_context_free_list,
(opal_free_list_item_t *) context);
req->req_free(&req);
/* Call back function return 1 to signal that request has been free'd */
return 1;
}
@ -350,92 +329,13 @@ int ompi_coll_adapt_ibcast(void *buff, int count, struct ompi_datatype_t *dataty
mca_coll_adapt_component.adapt_ibcast_max_send_requests,
mca_coll_adapt_component.adapt_ibcast_max_recv_requests));
ompi_coll_adapt_ibcast_fn_t bcast_func =
ompi_coll_adapt_ibcast_algorithm_index[mca_coll_adapt_component.adapt_ibcast_algorithm].ibcast_fn_ptr;
return bcast_func(buff, count, datatype, root, comm, request, module);
}
/*
* Ibcast functions with different algorithms
*/
static int
ompi_coll_adapt_ibcast_tuned(void *buff, int count, struct ompi_datatype_t *datatype,
int root, struct ompi_communicator_t *comm,
ompi_request_t ** request,
mca_coll_base_module_t *module)
{
OPAL_OUTPUT_VERBOSE((10, mca_coll_adapt_component.adapt_output, "tuned not implemented\n"));
return OMPI_ERR_NOT_IMPLEMENTED;
}
static int
ompi_coll_adapt_ibcast_binomial(void *buff, int count, struct ompi_datatype_t *datatype,
int root, struct ompi_communicator_t *comm,
ompi_request_t ** request, mca_coll_base_module_t * module)
{
ompi_coll_tree_t *tree = ompi_coll_base_topo_build_bmtree(comm, root);
return ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree,
mca_coll_adapt_component.adapt_ibcast_segment_size);
}
static int
ompi_coll_adapt_ibcast_in_order_binomial(void *buff, int count, struct ompi_datatype_t *datatype,
int root, struct ompi_communicator_t *comm,
ompi_request_t ** request,
mca_coll_base_module_t * module)
{
ompi_coll_tree_t *tree = ompi_coll_base_topo_build_in_order_bmtree(comm, root);
return ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree,
mca_coll_adapt_component.adapt_ibcast_segment_size);
}
static int
ompi_coll_adapt_ibcast_binary(void *buff, int count, struct ompi_datatype_t *datatype, int root,
struct ompi_communicator_t *comm, ompi_request_t ** request,
mca_coll_base_module_t * module)
{
ompi_coll_tree_t *tree = ompi_coll_base_topo_build_tree(2, comm, root);
return ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree,
mca_coll_adapt_component.adapt_ibcast_segment_size);
}
static int
ompi_coll_adapt_ibcast_pipeline(void *buff, int count, struct ompi_datatype_t *datatype,
int root, struct ompi_communicator_t *comm,
ompi_request_t ** request, mca_coll_base_module_t * module)
{
ompi_coll_tree_t *tree = ompi_coll_base_topo_build_chain(1, comm, root);
return ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree,
mca_coll_adapt_component.adapt_ibcast_segment_size);
}
static int
ompi_coll_adapt_ibcast_chain(void *buff, int count, struct ompi_datatype_t *datatype, int root,
struct ompi_communicator_t *comm, ompi_request_t ** request,
mca_coll_base_module_t * module)
{
ompi_coll_tree_t *tree = ompi_coll_base_topo_build_chain(4, comm, root);
return ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree,
mca_coll_adapt_component.adapt_ibcast_segment_size);
}
static int
ompi_coll_adapt_ibcast_linear(void *buff, int count, struct ompi_datatype_t *datatype, int root,
struct ompi_communicator_t *comm, ompi_request_t ** request,
mca_coll_base_module_t * module)
{
int fanout = ompi_comm_size(comm) - 1;
ompi_coll_tree_t *tree;
if (fanout < 1) {
tree = ompi_coll_base_topo_build_chain(1, comm, root);
} else if (fanout <= MAXTREEFANOUT) {
tree = ompi_coll_base_topo_build_tree(ompi_comm_size(comm) - 1, comm, root);
} else {
tree = ompi_coll_base_topo_build_tree(MAXTREEFANOUT, comm, root);
if (OMPI_COLL_ADAPT_ALGORITHM_TUNED == mca_coll_adapt_component.adapt_ibcast_algorithm) {
OPAL_OUTPUT_VERBOSE((10, mca_coll_adapt_component.adapt_output, "tuned not implemented\n"));
return OMPI_ERR_NOT_IMPLEMENTED;
}
return ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module, tree,
return ompi_coll_adapt_ibcast_generic(buff, count, datatype, root, comm, request, module,
adapt_module_cached_topology(module, comm, root, mca_coll_adapt_component.adapt_ibcast_algorithm),
mca_coll_adapt_component.adapt_ibcast_segment_size);
}
@ -459,8 +359,11 @@ int ompi_coll_adapt_ibcast_generic(void *buff, int count, struct ompi_datatype_t
/* Number of segments */
int num_segs;
mca_pml_base_send_mode_t sendmode = (mca_coll_adapt_component.adapt_ibcast_synchronous_send)
? MCA_PML_BASE_SEND_SYNCHRONOUS : MCA_PML_BASE_SEND_STANDARD;
/* The request passed outside */
ompi_request_t *temp_request = NULL;
ompi_coll_base_nbc_request_t *temp_request = NULL;
opal_mutex_t *mutex;
/* Store the segments which are received */
int *recv_array = NULL;
@ -486,17 +389,17 @@ int ompi_coll_adapt_ibcast_generic(void *buff, int count, struct ompi_datatype_t
}
/* Set up request */
temp_request = OBJ_NEW(ompi_request_t);
OMPI_REQUEST_INIT(temp_request, false);
temp_request->req_state = OMPI_REQUEST_ACTIVE;
temp_request->req_type = OMPI_REQUEST_COLL;
temp_request->req_free = ompi_coll_adapt_request_free;
temp_request->req_status.MPI_SOURCE = 0;
temp_request->req_status.MPI_TAG = 0;
temp_request->req_status.MPI_ERROR = 0;
temp_request->req_status._cancelled = 0;
temp_request->req_status._ucount = 0;
*request = temp_request;
temp_request = OBJ_NEW(ompi_coll_base_nbc_request_t);
OMPI_REQUEST_INIT(&temp_request->super, false);
temp_request->super.req_state = OMPI_REQUEST_ACTIVE;
temp_request->super.req_type = OMPI_REQUEST_COLL;
temp_request->super.req_free = ompi_coll_adapt_request_free;
temp_request->super.req_status.MPI_SOURCE = 0;
temp_request->super.req_status.MPI_TAG = 0;
temp_request->super.req_status.MPI_ERROR = 0;
temp_request->super.req_status._cancelled = 0;
temp_request->super.req_status._ucount = 0;
*request = (ompi_request_t*)temp_request;
/* Set up mutex */
mutex = OBJ_NEW(opal_mutex_t);
@ -534,7 +437,7 @@ int ompi_coll_adapt_ibcast_generic(void *buff, int count, struct ompi_datatype_t
con->send_array = send_array;
con->num_sent_segs = 0;
con->mutex = mutex;
con->request = temp_request;
con->request = (ompi_request_t*)temp_request;
con->tree = tree;
con->ibcast_tag = ompi_coll_base_nbc_reserve_tags(comm, num_segs);
@ -595,7 +498,7 @@ int ompi_coll_adapt_ibcast_generic(void *buff, int count, struct ompi_datatype_t
err =
MCA_PML_CALL(isend
(send_buff, send_count, datatype, context->peer,
con->ibcast_tag - i, MCA_PML_BASE_SEND_SYNCHRONOUS, comm,
con->ibcast_tag - i, sendmode, comm,
&send_req));
if (MPI_SUCCESS != err) {
return err;

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -41,6 +41,7 @@
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "coll_adapt_algorithms.h"
#include "coll_adapt_topocache.h"
/*
@ -52,6 +53,7 @@
*/
static void adapt_module_construct(mca_coll_adapt_module_t * module)
{
module->topo_cache = NULL;
module->adapt_enabled = false;
}
@ -60,6 +62,14 @@ static void adapt_module_construct(mca_coll_adapt_module_t * module)
*/
static void adapt_module_destruct(mca_coll_adapt_module_t * module)
{
if (NULL != module->topo_cache) {
adapt_topology_cache_item_t *item;
while (NULL != (item = (adapt_topology_cache_item_t*)opal_list_remove_first(module->topo_cache))) {
OBJ_RELEASE(item);
}
OBJ_RELEASE(module->topo_cache);
module->topo_cache = NULL;
}
module->adapt_enabled = false;
}
@ -69,12 +79,37 @@ OBJ_CLASS_INSTANCE(mca_coll_adapt_module_t,
adapt_module_construct,
adapt_module_destruct);
/*
* In this macro, the following variables are supposed to have been declared
* in the caller:
* . ompi_communicator_t *comm
* . mca_coll_adapt_module_t *adapt_module
*/
#define ADAPT_SAVE_PREV_COLL_API(__api) \
do { \
adapt_module->previous_ ## __api = comm->c_coll->coll_ ## __api; \
adapt_module->previous_ ## __api ## _module = comm->c_coll->coll_ ## __api ## _module; \
if (!comm->c_coll->coll_ ## __api || !comm->c_coll->coll_ ## __api ## _module) { \
opal_output_verbose(1, ompi_coll_base_framework.framework_output, \
"(%d/%s): no underlying " # __api"; disqualifying myself", \
comm->c_contextid, comm->c_name); \
return OMPI_ERROR; \
} \
OBJ_RETAIN(adapt_module->previous_ ## __api ## _module); \
} while(0)
/*
* Init module on the communicator
*/
static int adapt_module_enable(mca_coll_base_module_t * module,
struct ompi_communicator_t *comm)
{
mca_coll_adapt_module_t * adapt_module = (mca_coll_adapt_module_t*) module;
ADAPT_SAVE_PREV_COLL_API(reduce);
ADAPT_SAVE_PREV_COLL_API(ireduce);
return OMPI_SUCCESS;
}
@ -157,6 +192,7 @@ mca_coll_base_module_t *ompi_coll_adapt_comm_query(struct ompi_communicator_t *
*/
int ompi_coll_adapt_request_free(ompi_request_t ** request)
{
OMPI_REQUEST_FINI(*request);
(*request)->req_state = OMPI_REQUEST_INVALID;
OBJ_RELEASE(*request);
*request = MPI_REQUEST_NULL;

Просмотреть файл

@ -9,6 +9,8 @@
* $HEADER$
*/
#include "ompi/op/op.h"
#include "coll_adapt.h"
#include "coll_adapt_algorithms.h"
@ -17,6 +19,16 @@ int ompi_coll_adapt_reduce(const void *sbuf, void *rbuf, int count, struct ompi_
struct ompi_op_t *op, int root, struct ompi_communicator_t *comm,
mca_coll_base_module_t * module)
{
/* Fall back to the previous component if the operation is not commutative */
if (!ompi_op_is_commute(op)){
mca_coll_adapt_module_t *adapt_module = (mca_coll_adapt_module_t *) module;
OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output,
"ADAPT cannot handle reduce with this (commutative) operation. It needs to fall back on another component\n"));
return adapt_module->previous_reduce(sbuf, rbuf, count, dtype, op, root,
comm,
adapt_module->previous_reduce_module);
}
ompi_request_t *request = NULL;
int err = ompi_coll_adapt_ireduce(sbuf, rbuf, count, dtype, op, root, comm, &request, module);
if( MPI_SUCCESS != err ) {

105
ompi/mca/coll/adapt/coll_adapt_topocache.c Обычный файл
Просмотреть файл

@ -0,0 +1,105 @@
/*
* Copyright (c) 2014-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "coll_adapt.h"
#include "coll_adapt_topocache.h"
#include "ompi/communicator/communicator.h"
/* OBJ class destructor for a topology-cache entry: free the cached
 * tree, if one was ever built (TUNED/unknown algorithms cache NULL). */
static void destruct_topology_cache(adapt_topology_cache_item_t *item)
{
if (NULL != item->tree) {
ompi_coll_base_topo_destroy_tree(&item->tree);
}
}
OBJ_CLASS_INSTANCE(adapt_topology_cache_item_t, opal_list_item_t,
NULL, &destruct_topology_cache);
/**
 * Build the communication tree matching the requested ADAPT algorithm.
 *
 * @param algorithm  one of ompi_coll_adapt_algorithm_t
 * @param root       root rank of the collective
 * @param comm       communicator the tree is built over
 * @return a newly built tree (caller owns it; the topo cache releases it
 *         through destruct_topology_cache), or NULL for TUNED — which has
 *         no fixed topology — and for unknown algorithm values.
 */
static ompi_coll_tree_t *create_topology(
    ompi_coll_adapt_algorithm_t algorithm,
    int root,
    struct ompi_communicator_t *comm)
{
    switch(algorithm) {
    case OMPI_COLL_ADAPT_ALGORITHM_TUNED:
        /* Tuned selection is handled before topology lookup; no tree. */
        return NULL;
    case OMPI_COLL_ADAPT_ALGORITHM_BINOMIAL:
        return ompi_coll_base_topo_build_bmtree(comm, root);
    case OMPI_COLL_ADAPT_ALGORITHM_IN_ORDER_BINOMIAL:
        return ompi_coll_base_topo_build_in_order_bmtree(comm, root);
    case OMPI_COLL_ADAPT_ALGORITHM_BINARY:
        return ompi_coll_base_topo_build_tree(2, comm, root);
    case OMPI_COLL_ADAPT_ALGORITHM_PIPELINE:
        /* A chain with fanout 1 degenerates into a pipeline. */
        return ompi_coll_base_topo_build_chain(1, comm, root);
    case OMPI_COLL_ADAPT_ALGORITHM_CHAIN:
        return ompi_coll_base_topo_build_chain(4, comm, root);
    case OMPI_COLL_ADAPT_ALGORITHM_LINEAR:
    {
        /* Flat tree: root sends directly to everyone, capped at
         * MAXTREEFANOUT children. */
        int fanout = ompi_comm_size(comm) - 1;
        if (fanout < 1) {
            /* Single-process communicator: fall back to a trivial chain. */
            return ompi_coll_base_topo_build_chain(1, comm, root);
        } else if (fanout <= MAXTREEFANOUT) {
            return ompi_coll_base_topo_build_tree(fanout, comm, root);
        }
        return ompi_coll_base_topo_build_tree(MAXTREEFANOUT, comm, root);
    }
    default:
        /* Route the diagnostic through the component's output stream
         * instead of raw printf to stdout, consistent with the rest of
         * the ADAPT component. */
        opal_output_verbose(1, mca_coll_adapt_component.adapt_output,
                            "WARN: unknown topology %d", algorithm);
        return NULL;
    }
}
/**
 * Return the topology for (algorithm, root) on this module, building and
 * caching it on first use.  The cache is a per-module list, lazily
 * allocated; entries are released when the module is destructed.
 */
ompi_coll_tree_t* adapt_module_cached_topology(
    mca_coll_base_module_t *module,
    struct ompi_communicator_t *comm,
    int root,
    ompi_coll_adapt_algorithm_t algorithm)
{
    mca_coll_adapt_module_t *adapt = (mca_coll_adapt_module_t*)module;
    adapt_topology_cache_item_t *entry;
    ompi_coll_tree_t *new_tree;

    if (NULL == adapt->topo_cache) {
        /* First request on this module: create an empty cache. */
        adapt->topo_cache = OBJ_NEW(opal_list_t);
    } else {
        /* Look for a previously built tree for this algorithm/root pair. */
        OPAL_LIST_FOREACH(entry, adapt->topo_cache, adapt_topology_cache_item_t) {
            if (entry->algorithm == algorithm && entry->root == root) {
                return entry->tree;
            }
        }
    }

    /* Cache miss: build the tree and remember it for next time. */
    new_tree = create_topology(algorithm, root, comm);

    entry = OBJ_NEW(adapt_topology_cache_item_t);
    entry->tree = new_tree;
    entry->root = root;
    entry->algorithm = algorithm;
    opal_list_prepend(adapt->topo_cache, &entry->super);
    return new_tree;
}

35
ompi/mca/coll/adapt/coll_adapt_topocache.h Обычный файл
Просмотреть файл

@ -0,0 +1,35 @@
/*
* Copyright (c) 2014-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_COLL_ADAPT_TOPOCACHE_H
#define MCA_COLL_ADAPT_TOPOCACHE_H
#include "opal/class/opal_list.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_base_topo.h"
/* One entry of the per-module topology cache: a built communication
 * tree keyed by (algorithm, root).  Stored in the module's topo_cache
 * list; the tree is destroyed by the class destructor. */
typedef struct adapt_topology_cache_item_t {
/* list linkage (must be first for opal_list_t) */
opal_list_item_t super;
/* cached tree; may be NULL for algorithms without a fixed topology */
ompi_coll_tree_t *tree;
/* root rank the tree was built for */
int root;
/* ompi_coll_adapt_algorithm_t value the tree was built for */
int algorithm;
} adapt_topology_cache_item_t;
OBJ_CLASS_DECLARATION(adapt_topology_cache_item_t);
OMPI_DECLSPEC ompi_coll_tree_t* adapt_module_cached_topology(
mca_coll_base_module_t *module,
struct ompi_communicator_t *comm,
int root,
ompi_coll_adapt_algorithm_t algorithm);
#endif /* MCA_COLL_ADAPT_TOPOCACHE_H */

Просмотреть файл

@ -34,6 +34,10 @@
BEGIN_C_DECLS
/**
* Request structure to be returned by non-blocking
* collective operations.
*/
struct ompi_coll_base_nbc_request_t {
ompi_request_t super;
union {
@ -133,14 +137,29 @@ unsigned int ompi_mirror_perm(unsigned int x, int nbits);
*/
int ompi_rounddown(int num, int factor);
/**
* If necessary, retain op and store it in the
* request object, which should be of type ompi_coll_base_nbc_request_t
* (will be cast internally).
*/
int ompi_coll_base_retain_op( ompi_request_t *request,
ompi_op_t *op,
ompi_datatype_t *type);
/**
* If necessary, retain the datatypes and store them in the
* request object, which should be of type ompi_coll_base_nbc_request_t
* (will be cast internally).
*/
int ompi_coll_base_retain_datatypes( ompi_request_t *request,
ompi_datatype_t *stype,
ompi_datatype_t *rtype);
/**
* If necessary, retain the datatypes and store them in the
* request object, which should be of type ompi_coll_base_nbc_request_t
* (will be cast internally).
*/
int ompi_coll_base_retain_datatypes_w( ompi_request_t *request,
ompi_datatype_t * const stypes[],
ompi_datatype_t * const rtypes[]);