diff --git a/ompi/mca/mtl/ofi/README b/ompi/mca/mtl/ofi/README
index efe1ca531f..7a8a6838a7 100644
--- a/ompi/mca/mtl/ofi/README
+++ b/ompi/mca/mtl/ofi/README
@@ -72,7 +72,7 @@ by reducing the bits available for the communicator ID field in the OFI tag.
 SCALABLE ENDPOINTS:
 -------------------
-OFI MTL supports OFI Scalable Endpoints feature as a means to improve
+OFI MTL supports the OFI Scalable Endpoints (SEP) feature as a means to improve
 multi-threaded application throughput and message rate. Currently the feature
 is designed to utilize multiple TX/RX contexts exposed by the OFI provider in
 conjunction with a multi-communicator MPI application model. Therefore, new OFI
@@ -81,12 +81,13 @@ instead of creating them all at once during init time and this approach also
 favours only creating as many contexts as needed.
 
 1. Multi-communicator model:
-    With this approach, the application first duplicates the communicators it
-    wants to use with MPI operations (ideally creating as many communicators as
-    the number of threads it wants to use to call into MPI). The duplicated
-    communicators are then used by the corresponding threads to perform MPI
-    operations. A possible usage scenario could be in an MPI + OMP
-    application as follows (example limited to 2 ranks):
+    With this approach, the MPI application is required to first duplicate
+    the communicators it wants to use with MPI operations (ideally creating
+    as many communicators as the number of threads it wants to use to call
+    into MPI). The duplicated communicators are then used by the
+    corresponding threads to perform MPI operations. A possible usage
+    scenario could be in an MPI + OMP application as follows
+    (example limited to 2 ranks):
 
     MPI_Comm dup_comm[n];
     MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
@@ -112,13 +113,17 @@ favours only creating as many contexts as needed.
     }
 
 2. MCA variables:
-To utilize the feature, the following MCA variable needs to be set:
+   To utilize the feature, the following MCA variables need to be set:
 
 mtl_ofi_enable_sep:
-    This MCA variable needs to be set to enable the use of Scalable Endpoints
+    This MCA variable needs to be set to enable the use of the Scalable Endpoints (SEP)
     feature in the OFI MTL. The underlying provider is also checked to ensure
     the feature is supported. If the provider chosen does not support it, user needs
-    to either set this variable to 0 or select different provider which supports
+    to either set this variable to 0 or select a different provider which supports
     the feature.
+    For single-threaded applications, one OFI context is sufficient, so OFI SEPs
+    may not add any benefit.
+    Note that mtl_ofi_thread_grouping (see below) needs to be enabled to use the
+    different OFI SEP contexts. Otherwise, only one context (ctxt 0) will be used.
 
     Default: 0
 
@@ -126,7 +131,12 @@ To utilize the feature, the following MCA variable needs to be set:
     "-mca mtl_ofi_enable_sep 1"
 
 mtl_ofi_thread_grouping:
-    This MCA variable needs to be set to switch Thread Grouping feature on.
+    Turns the Thread Grouping feature on. This is needed to use the
+    Multi-communicator model explained above: the OFI MTL uses the
+    communicator ID to decide which SEP context a thread should use, so
+    that each thread has direct access to its own OFI resources. If
+    disabled, only context 0 will be used.
+    Requires mtl_ofi_enable_sep to be set to 1.
 
     Default: 0
 
@@ -139,11 +149,11 @@ To utilize the feature, the following MCA variable needs to be set:
     "-mca mtl_ofi_thread_grouping 1"
 
 mtl_ofi_num_ctxts:
-    MCA variable allows user to set the number of OFI contexts the applications
-    expects to use. For multi-threaded applications using Thread Grouping
-    feature, this number should be set to the number of user threads that will
-    call into MPI. For single-threaded applications one OFI context is
-    sufficient.
+    This MCA variable allows the user to set the number of OFI SEP contexts
+    the application expects to use. For multi-threaded applications using the
+    Thread Grouping feature, this number should be set to the number of user
+    threads that will call into MPI. It only has an effect if
+    mtl_ofi_enable_sep is set to 1.
 
     Default: 1
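Note for reviewers: the following compilable sketch exercises the
multi-communicator model described in the README hunk above. The thread
count, tags, and send/recv pattern are illustrative choices, not part of
this patch; run it with the MCA variables documented above, for example:

    mpirun -np 2 --mca mtl ofi --mca mtl_ofi_enable_sep 1 \
           --mca mtl_ofi_thread_grouping 1 --mca mtl_ofi_num_ctxts 4 ./sep_demo

    #include <mpi.h>
    #include <omp.h>

    #define NUM_THREADS 4   /* illustrative: one communicator per thread */

    int main(int argc, char **argv)
    {
        MPI_Comm dup_comm[NUM_THREADS];
        int provided, rank, i;

        MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        /* Duplicate one communicator per thread, as the model requires. */
        for (i = 0; i < NUM_THREADS; i++) {
            MPI_Comm_dup(MPI_COMM_WORLD, &dup_comm[i]);
        }

        if (MPI_THREAD_MULTIPLE == provided) {
            #pragma omp parallel num_threads(NUM_THREADS)
            {
                int tid = omp_get_thread_num();
                int buf = tid;

                /* Each thread drives its own communicator; with
                 * mtl_ofi_thread_grouping set to 1 this steers each
                 * thread onto a distinct OFI SEP TX/RX context. */
                if (0 == rank) {
                    MPI_Send(&buf, 1, MPI_INT, 1, tid, dup_comm[tid]);
                } else if (1 == rank) {
                    MPI_Recv(&buf, 1, MPI_INT, 0, tid, dup_comm[tid],
                             MPI_STATUS_IGNORE);
                }
            }
        }

        for (i = 0; i < NUM_THREADS; i++) {
            MPI_Comm_free(&dup_comm[i]);
        }
        MPI_Finalize();
        return 0;
    }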
diff --git a/ompi/mca/mtl/ofi/mtl_ofi.h b/ompi/mca/mtl/ofi/mtl_ofi.h
index 895bc44464..56167e8f0a 100644
--- a/ompi/mca/mtl/ofi/mtl_ofi.h
+++ b/ompi/mca/mtl/ofi/mtl_ofi.h
@@ -325,10 +325,18 @@ ompi_mtl_ofi_isend_callback(struct fi_cq_tagged_entry *wc,
     return OMPI_SUCCESS;
 }
 
-#define MTL_OFI_MAP_COMM_TO_CONTEXT(comm_id, ctxt_id) \
-    do { \
-        ctxt_id = ompi_mtl_ofi.comm_to_context[comm_id]; \
-    } while (0);
+/* Return the OFI context ID associated with the given communicator. */
+__opal_attribute_always_inline__ static inline int
+ompi_mtl_ofi_map_comm_to_ctxt(uint32_t comm_id)
+{
+    /* For the non-thread-grouping case, only one context is used and it is
+     * associated with MPI_COMM_WORLD, so use that one. */
+    if (0 == ompi_mtl_ofi.thread_grouping) {
+        comm_id = 0;
+    }
+
+    return ompi_mtl_ofi.comm_to_context[comm_id];
+}
 
 __opal_attribute_always_inline__ static inline int
 ompi_mtl_ofi_ssend_recv(ompi_mtl_ofi_request_t *ack_req,
@@ -342,7 +350,7 @@ ompi_mtl_ofi_ssend_recv(ompi_mtl_ofi_request_t *ack_req,
     ssize_t ret = OMPI_SUCCESS;
     int ctxt_id = 0;
 
-    MTL_OFI_MAP_COMM_TO_CONTEXT(comm->c_contextid, ctxt_id);
+    ctxt_id = ompi_mtl_ofi_map_comm_to_ctxt(comm->c_contextid);
     set_thread_context(ctxt_id);
 
     ack_req = malloc(sizeof(ompi_mtl_ofi_request_t));
@@ -397,7 +405,7 @@ ompi_mtl_ofi_send_generic(struct mca_mtl_base_module_t *mtl,
     fi_addr_t src_addr = 0;
     fi_addr_t sep_peer_fiaddr = 0;
 
-    MTL_OFI_MAP_COMM_TO_CONTEXT(comm->c_contextid, ctxt_id);
+    ctxt_id = ompi_mtl_ofi_map_comm_to_ctxt(comm->c_contextid);
     set_thread_context(ctxt_id);
 
     /**
@@ -532,7 +540,7 @@ ompi_mtl_ofi_isend_generic(struct mca_mtl_base_module_t *mtl,
     ompi_mtl_ofi_request_t *ack_req = NULL; /* For synchronous send */
     fi_addr_t sep_peer_fiaddr = 0;
 
-    MTL_OFI_MAP_COMM_TO_CONTEXT(comm->c_contextid, ctxt_id);
+    ctxt_id = ompi_mtl_ofi_map_comm_to_ctxt(comm->c_contextid);
     set_thread_context(ctxt_id);
 
     ofi_req->event_callback = ompi_mtl_ofi_isend_callback;
@@ -617,7 +625,7 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
     ompi_status_public_t *status = NULL;
     struct fi_msg_tagged tagged_msg;
 
-    MTL_OFI_MAP_COMM_TO_CONTEXT(ofi_req->comm->c_contextid, ctxt_id);
+    ctxt_id = ompi_mtl_ofi_map_comm_to_ctxt(ofi_req->comm->c_contextid);
 
     assert(ofi_req->super.ompi_req);
     status = &ofi_req->super.ompi_req->req_status;
@@ -758,7 +766,7 @@ ompi_mtl_ofi_irecv_generic(struct mca_mtl_base_module_t *mtl,
     size_t length;
     bool free_after;
 
-    MTL_OFI_MAP_COMM_TO_CONTEXT(comm->c_contextid, ctxt_id);
+    ctxt_id = ompi_mtl_ofi_map_comm_to_ctxt(comm->c_contextid);
     set_thread_context(ctxt_id);
 
     if (ofi_cq_data) {
@@ -884,7 +892,7 @@ ompi_mtl_ofi_imrecv(struct mca_mtl_base_module_t *mtl,
     uint64_t msgflags = FI_CLAIM | FI_COMPLETION;
     struct ompi_communicator_t *comm = (*message)->comm;
 
-    MTL_OFI_MAP_COMM_TO_CONTEXT(comm->c_contextid, ctxt_id);
+    ctxt_id = ompi_mtl_ofi_map_comm_to_ctxt(comm->c_contextid);
     set_thread_context(ctxt_id);
 
     ompi_ret = ompi_mtl_datatype_recv_buf(convertor,
@@ -977,7 +985,7 @@ ompi_mtl_ofi_iprobe_generic(struct mca_mtl_base_module_t *mtl,
     uint64_t msgflags = FI_PEEK | FI_COMPLETION;
     int ctxt_id = 0;
 
-    MTL_OFI_MAP_COMM_TO_CONTEXT(comm->c_contextid, ctxt_id);
+    ctxt_id = ompi_mtl_ofi_map_comm_to_ctxt(comm->c_contextid);
     set_thread_context(ctxt_id);
 
    if (ofi_cq_data) {
@@ -1066,7 +1074,7 @@ ompi_mtl_ofi_improbe_generic(struct mca_mtl_base_module_t *mtl,
     uint64_t msgflags = FI_PEEK | FI_CLAIM | FI_COMPLETION;
     int ctxt_id = 0;
 
-    MTL_OFI_MAP_COMM_TO_CONTEXT(comm->c_contextid, ctxt_id);
+    ctxt_id = ompi_mtl_ofi_map_comm_to_ctxt(comm->c_contextid);
     set_thread_context(ctxt_id);
 
     ofi_req = malloc(sizeof *ofi_req);
@@ -1168,7 +1176,7 @@ ompi_mtl_ofi_cancel(struct mca_mtl_base_module_t *mtl,
     int ret, ctxt_id = 0;
     ompi_mtl_ofi_request_t *ofi_req = (ompi_mtl_ofi_request_t*) mtl_request;
 
-    MTL_OFI_MAP_COMM_TO_CONTEXT(ofi_req->comm->c_contextid, ctxt_id);
+    ctxt_id = ompi_mtl_ofi_map_comm_to_ctxt(ofi_req->comm->c_contextid);
 
     switch (ofi_req->type) {
     case OMPI_MTL_OFI_SEND:
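Note for reviewers: the behavioral change in ompi_mtl_ofi_map_comm_to_ctxt()
relative to the removed macro is the fallback to context 0 when thread
grouping is disabled. The self-contained sketch below models only that logic;
toy_mtl_ofi and toy_map_comm_to_ctxt are hypothetical stand-ins for the real
ompi_mtl_ofi global and are not OMPI code:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical stand-in for the relevant fields of ompi_mtl_ofi. */
    struct toy_mtl_ofi {
        int thread_grouping;      /* mirrors the mtl_ofi_thread_grouping var */
        int comm_to_context[4];   /* communicator ID -> OFI SEP context ID */
    };

    static struct toy_mtl_ofi toy = {
        .thread_grouping = 0,
        .comm_to_context = { 0, 1, 2, 3 },
    };

    /* Same shape as the patch's ompi_mtl_ofi_map_comm_to_ctxt(): with
     * thread grouping disabled, every communicator funnels into ctxt 0. */
    static inline int toy_map_comm_to_ctxt(uint32_t comm_id)
    {
        if (0 == toy.thread_grouping) {
            comm_id = 0;
        }
        return toy.comm_to_context[comm_id];
    }

    int main(void)
    {
        /* Thread grouping off: all communicators map to context 0. */
        assert(0 == toy_map_comm_to_ctxt(2));

        /* Thread grouping on: each communicator keeps its own context. */
        toy.thread_grouping = 1;
        assert(2 == toy_map_comm_to_ctxt(2));

        puts("comm-to-context mapping behaves as documented");
        return 0;
    }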