/* * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ #ifndef MCA_BCOL_IBOFFLOAD_COLLREQ_H #define MCA_BCOL_IBOFFLOAD_COLLREQ_H #include "ompi_config.h" #include #include #include #include "opal/class/ompi_free_list.h" #include "bcol_iboffload.h" #include "bcol_iboffload_device.h" #include "bcol_iboffload_collfrag.h" #define SBUF 0 #define RBUF 1 #define BCOL_IBOFFLOAD_BUFFERS 2 BEGIN_C_DECLS struct mca_bcol_iboffload_reg_t; /* * collective progress function */ typedef int (*collective_message_progress_function)( struct mca_bcol_iboffload_module_t *iboffload, struct mca_bcol_iboffload_collreq_t *full_message_descriptor); /* * callback function to be called after the collective work request * completes. This is invoked in user-space, and is typically where * data may be copied out of library buffers, or when any other user- * level protocol may be completed * * input: * callback data: typically, this may be the work request just finished */ typedef int (*collective_message_completion_callback_function)( void *callback_data); struct mca_bcol_iboffload_buff_info { void *buf; size_t offset; uint32_t lkey; struct mca_bcol_iboffload_reg_t *iboffload_reg; }; typedef struct mca_bcol_iboffload_buff_info mca_bcol_iboffload_buff_info; /* * Collective message descriptor * the mca_bcol_iboffload_message_desc_t was replaced with mca_bcol_iboffload_collreq_t * ************************************************************************************************* * * Brief description of iboffload collective request dependencies: * * mca_bcol_iboffload_collreq_t <----<< Full coll request * | * --(0)-- mca_bcol_iboffload_collfrag_t <----<< Fragment of coll request ( for example * | | 10MB Bcast maybe split to 2MB fragments ) * | | * | --(0)-- mca_bcol_iboffload_task_t---mqe_task * | | | * | | ---mca_bcol_iboffload_frag_t---ibv_sge * | --(1)-- mca_bcol_iboffload_task_t---mqe_task * | | | * | | ---mca_bcol_iboffload_frag_t---ibv_sge * | ..(M).. * | * --(1)-- mca_bcol_iboffload_collfrag_t * | * ..(N).. * * ************************************************************************************************* */ struct mca_bcol_iboffload_collreq_t { ompi_request_t super; /* op type */ struct ompi_op_t *op; /* Sometimes the operation that should be performed by the IB is different than the mpi_op and is then set by the pack_data_for_calc function */ enum ibv_m_wr_calc_op actual_ib_op; /* Sometimes the data type that should be used by the IB to peroform the calc s different than the mpi dtype, and is then set by the pack_data_for_calc function */ enum ibv_m_wr_data_type actual_ib_dtype; /* data type */ struct ompi_datatype_t *dtype; /* convertor for send operation */ opal_convertor_t send_conv; /* convertor for recv operation */ opal_convertor_t recv_conv; /* * count (in data type units) */ uint64_t count; /* * root of collective operation */ int root; /* number of message fragments */ int n_fragments; /* number of fragments sent - all resrouces for a fragment are allocated * or none at all are */ int n_frags_sent; /* number of fragments completed from the MPI perspective */ int n_frag_mpi_complete; /* number of fragments completed from a network perspective */ int n_frag_net_complete; /* collective free and may be released - message complete from the ** MPI perspective, the network prespective, and the user is done ** with the message handle */ volatile bool user_handle_freed; /* list of collective fragements - only 1 for now */ opal_list_t work_requests; /* message progress function */ collective_message_progress_function progress_fn; /* work request completion callback function */ collective_message_completion_callback_function completion_cb_fn; /* index of qp with enough length of buffs for this collective */ int qp_index; bool if_bcol_last; /* The flag is used for the last bcol to indicate if the calculation should be done by the cpu */ bool do_calc_in_cpu; /* in Allreduce case, if (true == do_calc_in_cpu) => the final result will be calc on local CPU */ uint64_t l_operand; uint64_t r_operand; /* caching ML-rdma buffer descriptor */ mca_bcol_iboffload_rdma_buffer_desc_t *ml_rdma_desc; /* ML buffer index code */ int ml_buffer_index; /* In the current implementation the collrequest connected to 1 single iboffload module */ struct mca_bcol_iboffload_module_t *module; mca_bcol_iboffload_collfrag_t first_collfrag; /* Send/recv buffs info - user buffers registration if needed etc. */ mca_bcol_iboffload_buff_info buffer_info[BCOL_IBOFFLOAD_BUFFERS]; /* My bi nominal tree children in this collective */ int *bi_nominal_tree_children; /* Convertors for send/recv if needed */ opal_convertor_t send_convertor; opal_convertor_t recv_convertor; /* Order info from upper layer */ mca_bcol_base_order_info_t *order_info; }; typedef struct mca_bcol_iboffload_collreq_t mca_bcol_iboffload_collreq_t; OBJ_CLASS_DECLARATION(mca_bcol_iboffload_collreq_t); #define COLLREQ_IS_DONE(cr) (cr->user_handle_freed && \ (cr->n_frag_mpi_complete == cr->n_fragments) && \ (cr->n_frag_net_complete == cr->n_fragments)) #define RELEASE_COLLREQ(cr) \ do { \ (cr)->user_handle_freed = false; \ OMPI_FREE_LIST_RETURN_MT(&mca_bcol_iboffload_component.collreqs_free, \ (ompi_free_list_item_t *) (cr)); \ } while (0) static inline __opal_attribute_always_inline__ int mca_bcol_iboffload_free_resources_and_move_to_pending( mca_bcol_iboffload_collfrag_t *coll_fragment, mca_bcol_iboffload_module_t *iboffload) { int rc = mca_bcol_iboffload_free_tasks_frags_resources(coll_fragment, iboffload->device->frags_free); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { return rc; } IBOFFLOAD_VERBOSE(10, ("iboffload - %p, coll_fragment - %p, " "coll frag in_pending_list ? - %d, pending_list size - %d.\n", iboffload, coll_fragment, coll_fragment->in_pending_list, opal_list_get_size(&iboffload->collfrag_pending))); BCOL_IBOFFLOAD_MQ_RETURN_CREDITS(iboffload, coll_fragment->mq_index, coll_fragment->mq_credits); /* Remove coll frag from coll request opal_list */ opal_list_remove_item(&coll_fragment->coll_full_req->work_requests, (opal_list_item_t *) coll_fragment); if (false == coll_fragment->in_pending_list) { /* Put the collfrag on pending list */ coll_fragment->in_pending_list = true; opal_list_append(&iboffload->collfrag_pending, (opal_list_item_t *) coll_fragment); } else { /* The item is already on pending list => insert it first that not break order between frags on the list */ opal_list_prepend(&iboffload->collfrag_pending, (opal_list_item_t *) coll_fragment); } return OMPI_SUCCESS; } /* Forward declaration */ struct mca_bcol_iboffload_reg_t; static inline __opal_attribute_always_inline__ int mca_bcol_iboffload_prepare_buffer( void *buffer, size_t size, struct mca_bcol_iboffload_reg_t **registration_handler, mca_bcol_iboffload_module_t *iboffload) { int rc; mca_mpool_base_registration_t *reg = NULL; assert(size > 0); rc = iboffload->device->mpool->mpool_register( iboffload->device->mpool, buffer, size, (uint32_t) 0 /* flags */, ®); *registration_handler = (struct mca_bcol_iboffload_reg_t *) reg; return rc; } int mca_bcol_iboffload_coll_req_implement( mca_bcol_iboffload_module_t *iboffload, mca_bcol_iboffload_collreq_t *coll_request); END_C_DECLS #endif