274 строки
8.9 KiB
C
274 строки
8.9 KiB
C
|
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
|
||
|
|
||
|
#ifndef MCA_BCOL_IBOFFLOAD_COLLREQ_H
|
||
|
#define MCA_BCOL_IBOFFLOAD_COLLREQ_H
|
||
|
|
||
|
#include "ompi_config.h"
|
||
|
|
||
|
#include <infiniband/mqe.h>
|
||
|
#include <infiniband/verbs.h>
|
||
|
#include <infiniband/mverbs.h>
|
||
|
|
||
|
#include "ompi/class/ompi_free_list.h"
|
||
|
|
||
|
#include "bcol_iboffload.h"
|
||
|
#include "bcol_iboffload_device.h"
|
||
|
#include "bcol_iboffload_collfrag.h"
|
||
|
|
||
|
/*
 * Buffer slot selectors.
 * NOTE(review): presumably SBUF/RBUF index the send and receive entries of
 * the per-request buffer_info[] array declared in
 * mca_bcol_iboffload_collreq_t - confirm against the callers.
 */
#define SBUF 0
#define RBUF 1

/* Number of entries in the per-request buffer_info[] array. */
#define BCOL_IBOFFLOAD_BUFFERS 2
|
||
|
|
||
|
BEGIN_C_DECLS
|
||
|
|
||
|
struct mca_bcol_iboffload_reg_t;
|
||
|
|
||
|
/*
 * Forward declarations: guarantee that the struct tags named in the
 * prototype below have file scope (and not prototype scope) even if no
 * previously included header has declared them yet.
 */
struct mca_bcol_iboffload_module_t;
struct mca_bcol_iboffload_collreq_t;

/*
 * collective progress function
 *
 * input:
 * iboffload:               the iboffload module
 * full_message_descriptor: the full collective request
 *
 * returns an int status code
 * (NOTE(review): presumably OMPI_SUCCESS or an OMPI error code - confirm)
 */
typedef int (*collective_message_progress_function)(
        struct mca_bcol_iboffload_module_t *iboffload,
        struct mca_bcol_iboffload_collreq_t *full_message_descriptor);
|
||
|
/*
 * callback function to be called after the collective work request
 * completes. This is invoked in user-space, and is typically where
 * data may be copied out of library buffers, or when any other user-
 * level protocol may be completed
 *
 * input:
 * callback data: typically, this may be the work request just finished
 *
 * returns an int status code
 */
typedef int (*collective_message_completion_callback_function)(
        void *callback_data);
|
||
|
|
||
|
/*
 * Description of one buffer (send or receive side) used by a collective
 * request.
 */
struct mca_bcol_iboffload_buff_info {
    /* start address of the buffer */
    void *buf;
    /* offset associated with the buffer
       (NOTE(review): presumably in bytes from buf - confirm) */
    size_t offset;
    /* local memory key
       (NOTE(review): presumably the IB lkey of the registration - confirm) */
    uint32_t lkey;
    /* registration handle for the buffer */
    struct mca_bcol_iboffload_reg_t *iboffload_reg;
};
typedef struct mca_bcol_iboffload_buff_info mca_bcol_iboffload_buff_info;
|
||
|
|
||
|
/*
|
||
|
* Collective message descriptor
|
||
|
* the mca_bcol_iboffload_message_desc_t was replaced with mca_bcol_iboffload_collreq_t
|
||
|
* *************************************************************************************************
|
||
|
*
|
||
|
* Brief description of iboffload collective request dependencies:
|
||
|
*
|
||
|
* mca_bcol_iboffload_collreq_t <----<< Full coll request
|
||
|
* |
|
||
|
* --(0)-- mca_bcol_iboffload_collfrag_t <----<< Fragment of coll request ( for example
|
||
|
* | | 10MB Bcast maybe split to 2MB fragments )
|
||
|
* | |
|
||
|
* | --(0)-- mca_bcol_iboffload_task_t---mqe_task
|
||
|
* | | |
|
||
|
* | | ---mca_bcol_iboffload_frag_t---ibv_sge
|
||
|
* | --(1)-- mca_bcol_iboffload_task_t---mqe_task
|
||
|
* | | |
|
||
|
* | | ---mca_bcol_iboffload_frag_t---ibv_sge
|
||
|
* | ..(M)..
|
||
|
* |
|
||
|
* --(1)-- mca_bcol_iboffload_collfrag_t
|
||
|
* |
|
||
|
* ..(N)..
|
||
|
*
|
||
|
* *************************************************************************************************
|
||
|
*/
|
||
|
|
||
|
struct mca_bcol_iboffload_collreq_t {
|
||
|
ompi_request_t super;
|
||
|
|
||
|
/* op type */
|
||
|
struct ompi_op_t *op;
|
||
|
|
||
|
/* Sometimes the operation that should be performed
|
||
|
by the IB is different than the mpi_op and is then set
|
||
|
by the pack_data_for_calc function */
|
||
|
enum ibv_m_wr_calc_op actual_ib_op;
|
||
|
|
||
|
/* Sometimes the data type that should be used by the IB
|
||
|
to peroform the calc s different than the mpi dtype,
|
||
|
and is then set by the pack_data_for_calc function */
|
||
|
enum ibv_m_wr_data_type actual_ib_dtype;
|
||
|
|
||
|
/* data type */
|
||
|
struct ompi_datatype_t *dtype;
|
||
|
|
||
|
/* convertor for send operation */
|
||
|
opal_convertor_t send_conv;
|
||
|
|
||
|
/* convertor for recv operation */
|
||
|
opal_convertor_t recv_conv;
|
||
|
|
||
|
/*
|
||
|
* count (in data type units)
|
||
|
*/
|
||
|
uint64_t count;
|
||
|
|
||
|
/*
|
||
|
* root of collective operation
|
||
|
*/
|
||
|
int root;
|
||
|
|
||
|
/* number of message fragments */
|
||
|
int n_fragments;
|
||
|
|
||
|
/* number of fragments sent - all resrouces for a fragment are allocated
|
||
|
* or none at all are
|
||
|
*/
|
||
|
int n_frags_sent;
|
||
|
|
||
|
/* number of fragments completed from the MPI perspective */
|
||
|
int n_frag_mpi_complete;
|
||
|
|
||
|
/* number of fragments completed from a network perspective */
|
||
|
int n_frag_net_complete;
|
||
|
|
||
|
/* collective free and may be released - message complete from the
|
||
|
** MPI perspective, the network prespective, and the user is done
|
||
|
** with the message handle */
|
||
|
volatile bool user_handle_freed;
|
||
|
|
||
|
/* list of collective fragements - only 1 for now */
|
||
|
opal_list_t work_requests;
|
||
|
|
||
|
/* message progress function */
|
||
|
collective_message_progress_function progress_fn;
|
||
|
|
||
|
/* work request completion callback function */
|
||
|
collective_message_completion_callback_function completion_cb_fn;
|
||
|
|
||
|
/* index of qp with enough length of buffs for this collective */
|
||
|
int qp_index;
|
||
|
|
||
|
bool if_bcol_last;
|
||
|
|
||
|
/* The flag is used for the last bcol to indicate if the calculation should be done by the cpu */
|
||
|
bool do_calc_in_cpu;
|
||
|
|
||
|
/* in Allreduce case, if (true == do_calc_in_cpu) =>
|
||
|
the final result will be calc on local CPU */
|
||
|
uint64_t l_operand;
|
||
|
uint64_t r_operand;
|
||
|
|
||
|
/* caching ML-rdma buffer descriptor */
|
||
|
mca_bcol_iboffload_rdma_buffer_desc_t *ml_rdma_desc;
|
||
|
|
||
|
/* ML buffer index code */
|
||
|
int ml_buffer_index;
|
||
|
|
||
|
/* In the current implementation the collrequest connected to 1 single
|
||
|
iboffload module */
|
||
|
struct mca_bcol_iboffload_module_t *module;
|
||
|
|
||
|
mca_bcol_iboffload_collfrag_t first_collfrag;
|
||
|
|
||
|
/* Send/recv buffs info - user buffers registration if needed etc. */
|
||
|
mca_bcol_iboffload_buff_info buffer_info[BCOL_IBOFFLOAD_BUFFERS];
|
||
|
|
||
|
/* My bi nominal tree children in this collective */
|
||
|
int *bi_nominal_tree_children;
|
||
|
|
||
|
/* Convertors for send/recv if needed */
|
||
|
opal_convertor_t send_convertor;
|
||
|
opal_convertor_t recv_convertor;
|
||
|
|
||
|
/* Order info from upper layer */
|
||
|
mca_bcol_base_order_info_t *order_info;
|
||
|
};
|
||
|
typedef struct mca_bcol_iboffload_collreq_t mca_bcol_iboffload_collreq_t;
|
||
|
OBJ_CLASS_DECLARATION(mca_bcol_iboffload_collreq_t);
|
||
|
|
||
|
/*
 * True when the collective request may be released: the user handle has
 * been freed and every fragment is complete from both the MPI and the
 * network perspective.
 *
 * cr is a pointer to the request; it is parenthesized so that expression
 * arguments (e.g. &req or a cast) expand correctly. Note that cr is
 * evaluated several times - do not pass an expression with side effects.
 */
#define COLLREQ_IS_DONE(cr) ((cr)->user_handle_freed &&                     \
        ((cr)->n_frag_mpi_complete == (cr)->n_fragments) &&                 \
        ((cr)->n_frag_net_complete == (cr)->n_fragments))
|
||
|
|
||
|
/*
 * Hand a collective request back to the component-wide free list.
 * The user_handle_freed flag is cleared first, before the item is
 * returned to mca_bcol_iboffload_component.collreqs_free.
 *
 * cr is a pointer to the request and is evaluated twice.
 */
#define RELEASE_COLLREQ(cr)                                                 \
    do {                                                                    \
        (cr)->user_handle_freed = false;                                    \
        OMPI_FREE_LIST_RETURN(&mca_bcol_iboffload_component.collreqs_free,  \
                              (ompi_free_list_item_t *) (cr));              \
    } while (0)
|
||
|
|
||
|
static inline __opal_attribute_always_inline__
|
||
|
int mca_bcol_iboffload_free_resources_and_move_to_pending(
|
||
|
mca_bcol_iboffload_collfrag_t *coll_fragment,
|
||
|
mca_bcol_iboffload_module_t *iboffload)
|
||
|
{
|
||
|
int rc = mca_bcol_iboffload_free_tasks_frags_resources(coll_fragment,
|
||
|
iboffload->device->frags_free);
|
||
|
|
||
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
|
||
|
return rc;
|
||
|
}
|
||
|
|
||
|
IBOFFLOAD_VERBOSE(10, ("iboffload - %p, coll_fragment - %p, "
|
||
|
"coll frag in_pending_list ? - %d, pending_list size - %d.\n",
|
||
|
iboffload, coll_fragment, coll_fragment->in_pending_list,
|
||
|
opal_list_get_size(&iboffload->collfrag_pending)));
|
||
|
|
||
|
BCOL_IBOFFLOAD_MQ_RETURN_CREDITS(iboffload, coll_fragment->mq_index, coll_fragment->mq_credits);
|
||
|
|
||
|
/* Remove coll frag from coll request opal_list */
|
||
|
opal_list_remove_item(&coll_fragment->coll_full_req->work_requests,
|
||
|
(opal_list_item_t *) coll_fragment);
|
||
|
|
||
|
if (false == coll_fragment->in_pending_list) {
|
||
|
/* Put the collfrag on pending list */
|
||
|
coll_fragment->in_pending_list = true;
|
||
|
opal_list_append(&iboffload->collfrag_pending,
|
||
|
(opal_list_item_t *) coll_fragment);
|
||
|
} else {
|
||
|
/* The item is already on pending list =>
|
||
|
insert it first that not break order
|
||
|
between frags on the list */
|
||
|
opal_list_prepend(&iboffload->collfrag_pending,
|
||
|
(opal_list_item_t *) coll_fragment);
|
||
|
}
|
||
|
|
||
|
return OMPI_SUCCESS;
|
||
|
}
|
||
|
|
||
|
/* Forward declaration */
|
||
|
struct mca_bcol_iboffload_reg_t;
|
||
|
static inline __opal_attribute_always_inline__
|
||
|
int mca_bcol_iboffload_prepare_buffer(
|
||
|
void *buffer,
|
||
|
size_t size,
|
||
|
struct mca_bcol_iboffload_reg_t **registration_handler,
|
||
|
mca_bcol_iboffload_module_t *iboffload)
|
||
|
{
|
||
|
int rc;
|
||
|
mca_mpool_base_registration_t *reg = NULL;
|
||
|
|
||
|
assert(size > 0);
|
||
|
rc = iboffload->device->mpool->mpool_register(
|
||
|
iboffload->device->mpool,
|
||
|
buffer, size,
|
||
|
(uint32_t) 0 /* flags */,
|
||
|
®);
|
||
|
|
||
|
*registration_handler =
|
||
|
(struct mca_bcol_iboffload_reg_t *) reg;
|
||
|
|
||
|
return rc;
|
||
|
}
|
||
|
|
||
|
int mca_bcol_iboffload_coll_req_implement(
|
||
|
mca_bcol_iboffload_module_t *iboffload,
|
||
|
mca_bcol_iboffload_collreq_t *coll_request);
|
||
|
|
||
|
END_C_DECLS
|
||
|
|
||
|
#endif
|