From dae3c9447cbde2b4ffd525a0e7bebcce36d48dd5 Mon Sep 17 00:00:00 2001 From: Thananon Patinyasakdikul Date: Thu, 7 Jun 2018 09:33:12 -0700 Subject: [PATCH] btl/ofi: add scalable endpoint support. This commit add support for scalable endpoint to enhance multithreaded application performance. The BTL will detect the support from ofi provider and will fallback to normal usage of scalable endpoint is not supported. NEW MCA parameters: - mca_btl_ofi_disable_sep: force the btl to not use scalable endpoint. - mca_btl_ofi_num_contexts_per_module: number of communication context to create (should be the same as number of thread). Signed-off-by: Thananon Patinyasakdikul --- opal/mca/btl/ofi/btl_ofi.h | 59 ++++- opal/mca/btl/ofi/btl_ofi_atomics.c | 25 +- opal/mca/btl/ofi/btl_ofi_component.c | 348 ++++++++++++++++----------- opal/mca/btl/ofi/btl_ofi_endpoint.c | 291 ++++++++++++++++++++++ opal/mca/btl/ofi/btl_ofi_endpoint.h | 22 ++ opal/mca/btl/ofi/btl_ofi_module.c | 8 +- opal/mca/btl/ofi/btl_ofi_rdma.c | 24 +- opal/mca/btl/ofi/btl_ofi_rdma.h | 1 + 8 files changed, 610 insertions(+), 168 deletions(-) diff --git a/opal/mca/btl/ofi/btl_ofi.h b/opal/mca/btl/ofi/btl_ofi.h index ca96e415c7..83107ef9b6 100644 --- a/opal/mca/btl/ofi/btl_ofi.h +++ b/opal/mca/btl/ofi/btl_ofi.h @@ -46,9 +46,7 @@ #include BEGIN_C_DECLS - -#define MCA_BTL_OFI_MAX_MODULES 16 -#define MCA_BTL_OFI_MAX_WORKERS 1 +#define MCA_BTL_OFI_MAX_MODULES 16 #define MCA_BTL_OFI_MAX_CQ_READ_ENTRIES 128 #define MCA_BTL_OFI_ABORT(args) mca_btl_ofi_exit(args) @@ -62,6 +60,26 @@ enum mca_btl_ofi_type { MCA_BTL_OFI_TYPE_TOTAL }; +struct mca_btl_ofi_context_t { + int32_t context_id; + + /* transmit context */ + struct fid_ep *tx_ctx; + struct fid_ep *rx_ctx; + + /* completion queue */ + struct fid_cq *cq; + + /* completion info freelist */ + /* We have it per context to reduce the thread contention + * on the freelist. Things can get really slow. */ + opal_free_list_t comp_list; + + /* for thread locking */ + volatile int32_t lock; +}; +typedef struct mca_btl_ofi_context_t mca_btl_ofi_context_t; + /** * @brief OFI BTL module */ @@ -74,17 +92,17 @@ struct mca_btl_ofi_module_t { struct fid_fabric *fabric; struct fid_domain *domain; struct fid_ep *ofi_endpoint; - struct fid_cq *cq; struct fid_av *av; + int num_contexts; + mca_btl_ofi_context_t *contexts; + char *linux_device_name; /** whether the module has been fully initialized or not */ bool initialized; bool use_virt_addr; - - /** spin-lock to protect the module */ - volatile int32_t lock; + bool is_scalable_ep; int64_t outstanding_rdma; @@ -92,8 +110,7 @@ struct mca_btl_ofi_module_t { * there is no need for a complicated structure here at this time*/ opal_list_t endpoints; - /* free lists */ - opal_free_list_t comp_list; + opal_mutex_t module_lock; /** registration cache */ mca_rcache_base_module_t *rcache; @@ -110,6 +127,7 @@ struct mca_btl_ofi_component_t { /** number of TL modules */ int module_count; + int num_contexts_per_module; int num_cqe_read; size_t namelen; @@ -117,10 +135,6 @@ struct mca_btl_ofi_component_t { /** All BTL OFI modules (1 per tl) */ mca_btl_ofi_module_t *modules[MCA_BTL_OFI_MAX_MODULES]; -#if OPAL_C_HAVE__THREAD_LOCAL - /** bind threads to contexts */ - bool bind_threads_to_contexts; -#endif }; typedef struct mca_btl_ofi_component_t mca_btl_ofi_component_t; @@ -151,6 +165,7 @@ struct mca_btl_ofi_completion_t { struct mca_btl_base_module_t *btl; struct mca_btl_base_endpoint_t *endpoint; + struct mca_btl_ofi_context_t *my_context; uint32_t type; void *local_address; @@ -269,7 +284,25 @@ int mca_btl_ofi_reg_mem (void *reg_data, void *base, size_t size, mca_rcache_base_registration_t *reg); int mca_btl_ofi_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg); +int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context); void mca_btl_ofi_exit(void); +/* thread atomics */ +static inline bool mca_btl_ofi_context_trylock (mca_btl_ofi_context_t *context) +{ + return (context->lock || OPAL_ATOMIC_SWAP_32(&context->lock, 1)); +} + +static inline void mca_btl_ofi_context_lock(mca_btl_ofi_context_t *context) +{ + while (mca_btl_ofi_context_trylock(context)); +} + +static inline void mca_btl_ofi_context_unlock(mca_btl_ofi_context_t *context) +{ + opal_atomic_mb(); + context->lock = 0; +} + END_C_DECLS #endif diff --git a/opal/mca/btl/ofi/btl_ofi_atomics.c b/opal/mca/btl/ofi/btl_ofi_atomics.c index 7d83d5c2b2..d6b4da333e 100644 --- a/opal/mca/btl/ofi/btl_ofi_atomics.c +++ b/opal/mca/btl/ofi/btl_ofi_atomics.c @@ -43,6 +43,9 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl; mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint; mca_btl_ofi_completion_t *comp = NULL; + mca_btl_ofi_context_t *ofi_context; + + ofi_context = get_ofi_context(ofi_btl); if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) { fi_datatype = FI_UINT32; @@ -51,6 +54,7 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end fi_op = to_fi_op(op); comp = mca_btl_ofi_completion_alloc(btl, endpoint, + ofi_context, local_address, local_handle, cbfunc, cbcontext, cbdata, @@ -61,7 +65,7 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end remote_address = (remote_address - (uint64_t) remote_handle->base_addr); - rc = fi_fetch_atomic(ofi_btl->ofi_endpoint, + rc = fi_fetch_atomic(ofi_context->tx_ctx, (void*) &comp->operand, 1, NULL, /* operand */ local_address, local_handle->desc, /* results */ btl_endpoint->peer_addr, /* remote addr */ @@ -77,6 +81,9 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl); + /* force a bit of progress. */ + mca_btl_ofi_component.super.btl_progress(); + return OPAL_SUCCESS; } @@ -92,6 +99,9 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl; mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint; mca_btl_ofi_completion_t *comp = NULL; + mca_btl_ofi_context_t *ofi_context; + + ofi_context = get_ofi_context(ofi_btl); if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) { fi_datatype = FI_UINT32; @@ -100,6 +110,7 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t fi_op = to_fi_op(op); comp = mca_btl_ofi_completion_alloc(btl, endpoint, + ofi_context, NULL, NULL, cbfunc, cbcontext, cbdata, @@ -110,7 +121,7 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t remote_address = (remote_address - (uint64_t) remote_handle->base_addr); - rc = fi_atomic(ofi_btl->ofi_endpoint, + rc = fi_atomic(ofi_context->tx_ctx, (void*) &comp->operand, 1, NULL, /* operand */ btl_endpoint->peer_addr, /* remote addr */ remote_address, remote_handle->rkey, /* remote buffer */ @@ -124,6 +135,7 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t } MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl); + mca_btl_ofi_component.super.btl_progress(); return OPAL_SUCCESS; } @@ -139,12 +151,16 @@ int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_e mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl; mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint; mca_btl_ofi_completion_t *comp = NULL; + mca_btl_ofi_context_t *ofi_context; + + ofi_context = get_ofi_context(ofi_btl); if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) { fi_datatype = FI_UINT32; } comp = mca_btl_ofi_completion_alloc(btl, endpoint, + ofi_context, local_address, local_handle, cbfunc, cbcontext, cbdata, @@ -157,7 +173,7 @@ int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_e remote_address = (remote_address - (uint64_t) remote_handle->base_addr); /* perform atomic */ - rc = fi_compare_atomic(ofi_btl->ofi_endpoint, + rc = fi_compare_atomic(ofi_context->tx_ctx, (void*) &comp->operand, 1, NULL, (void*) &comp->compare, NULL, local_address, local_handle->desc, @@ -176,5 +192,8 @@ int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_e MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl); + /* force a bit of progress. */ + mca_btl_ofi_component.super.btl_progress(); + return OPAL_SUCCESS; } diff --git a/opal/mca/btl/ofi/btl_ofi_component.c b/opal/mca/btl/ofi/btl_ofi_component.c index 840d0dfdb8..50ad897ff2 100644 --- a/opal/mca/btl/ofi/btl_ofi_component.c +++ b/opal/mca/btl/ofi/btl_ofi_component.c @@ -31,15 +31,16 @@ #include #include "btl_ofi.h" +#include "btl_ofi_endpoint.h" #include "btl_ofi_rdma.h" - #define MCA_BTL_OFI_REQUIRED_CAPS (FI_RMA | FI_ATOMIC) #define MCA_BTL_OFI_REQUESTED_MR_MODE (FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_VIRT_ADDR) static char *prov_include; static char *prov_exclude; static char *ofi_progress_mode; +static bool disable_sep; static int mca_btl_ofi_init_device(struct fi_info *info); /* validate information returned from fi_getinfo(). @@ -124,16 +125,24 @@ static int mca_btl_ofi_component_register(void) MCA_BASE_VAR_SCOPE_READONLY, &ofi_progress_mode); -#if OPAL_C_HAVE__THREAD_LOCAL - mca_btl_ofi_component.bind_threads_to_contexts = true; + mca_btl_ofi_component.num_contexts_per_module = 1; (void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version, - "bind_threads_to_contexts", "Bind threads to device contexts. " - "In general this should improve the multi-threaded performance " - "when threads are used. (default: true)", MCA_BASE_VAR_TYPE_BOOL, - NULL, 0 ,MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_ALL, - &mca_btl_ofi_component.bind_threads_to_contexts); -#endif + "num_contexts_per_module", + "number of communication context per module to create. " + "This should increase multithreaded performance but it is " + "advised that this number should be lower than total cores.", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_btl_ofi_component.num_contexts_per_module); + disable_sep = false; + (void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version, + "disable_sep", + "force btl/ofi to never use scalable endpoint. ", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, + &disable_sep); /* for now we want this component to lose to btl/ugni and btl/vader */ module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 50; @@ -148,7 +157,6 @@ static int mca_btl_ofi_component_open(void) return OPAL_SUCCESS; } - /* * component cleanup - sanity checking of queue lengths */ @@ -289,21 +297,34 @@ static int mca_btl_ofi_init_device(struct fi_info *info) int rc; int *module_count = &mca_btl_ofi_component.module_count; size_t namelen; - mca_btl_ofi_module_t *module; + size_t num_contexts_to_create; char *linux_device_name; char ep_name[FI_NAME_MAX]; + struct fi_info *ofi_info; - struct fi_cq_attr cq_attr = {0}; + struct fi_ep_attr *ep_attr; + struct fi_domain_attr *domain_attr; struct fi_av_attr av_attr = {0}; struct fid_fabric *fabric = NULL; struct fid_domain *domain = NULL; - struct fid_ep *endpoint = NULL; - struct fid_cq *cq = NULL; + struct fid_ep *ep = NULL; struct fid_av *av = NULL; + mca_btl_ofi_module_t *module; + + /* allocate module */ + module = (mca_btl_ofi_module_t*) calloc(1, sizeof(mca_btl_ofi_module_t)); + if (NULL == module) { + BTL_ERROR(("failed to allocate memory for OFI module")); + goto fail; + } + *module = mca_btl_ofi_module_template; + /* make a copy of the given info to store on the module */ ofi_info = fi_dupinfo(info); + ep_attr = ofi_info->ep_attr; + domain_attr = ofi_info->domain_attr; linux_device_name = info->domain_attr->name; BTL_VERBOSE(("initializing dev:%s provider:%s", @@ -330,28 +351,6 @@ static int mca_btl_ofi_init_device(struct fi_info *info) goto fail; } - /* endpoint */ - rc = fi_endpoint(domain, ofi_info, &endpoint, NULL); - if (0 != rc) { - BTL_VERBOSE(("%s failed fi_endpoint with err=%s", - linux_device_name, - fi_strerror(-rc) - )); - goto fail; - } - - /* CQ */ - cq_attr.format = FI_CQ_FORMAT_CONTEXT; - cq_attr.wait_obj = FI_WAIT_NONE; - rc = fi_cq_open(domain, &cq_attr, &cq, NULL); - if (0 != rc) { - BTL_VERBOSE(("%s failed fi_cq_open with err=%s", - linux_device_name, - fi_strerror(-rc) - )); - goto fail; - } - /* AV */ av_attr.type = FI_AV_MAP; rc = fi_av_open(domain, &av_attr, &av, NULL); @@ -363,29 +362,76 @@ static int mca_btl_ofi_init_device(struct fi_info *info) goto fail; } + num_contexts_to_create = mca_btl_ofi_component.num_contexts_per_module; - /* bind CQ and AV to endpoint */ - uint32_t cq_flags = (FI_TRANSMIT); - rc = fi_ep_bind(endpoint, (fid_t)cq, cq_flags); - if (0 != rc) { - BTL_VERBOSE(("%s failed fi_ep_bind with err=%s", - linux_device_name, - fi_strerror(-rc) - )); - goto fail; + /* If the domain support scalable endpoint. */ + if (domain_attr->max_ep_tx_ctx > 1 && !disable_sep) { + + BTL_VERBOSE(("btl/ofi using scalable endpoint.")); + + if (num_contexts_to_create > domain_attr->max_ep_tx_ctx) { + BTL_VERBOSE(("cannot create requested %u contexts. (node max=%zu)", + module->num_contexts, + domain_attr->max_ep_tx_ctx)); + goto fail; + } + + /* modify the info to let the provider know we are creating x contexts */ + ep_attr->tx_ctx_cnt = num_contexts_to_create; + ep_attr->rx_ctx_cnt = num_contexts_to_create; + + /* create scalable endpoint */ + rc = fi_scalable_ep(domain, ofi_info, &ep, NULL); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_scalable_ep with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto fail; + } + + module->num_contexts = num_contexts_to_create; + module->is_scalable_ep = true; + + /* create contexts */ + module->contexts = mca_btl_ofi_context_alloc_scalable(ofi_info, + domain, ep, av, + num_contexts_to_create); + + } else { + /* warn the user if they want more than 1 context */ + if (num_contexts_to_create > 1) { + BTL_ERROR(("cannot create %zu contexts as the provider does not support " + "scalable endpoint. Falling back to single context endpoint.", + num_contexts_to_create)); + } + + BTL_VERBOSE(("btl/ofi using normal endpoint.")); + + rc = fi_endpoint(domain, ofi_info, &ep, NULL); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_endpoint with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto fail; + } + + module->num_contexts = 1; + module->is_scalable_ep = false; + + /* create contexts */ + module->contexts = mca_btl_ofi_context_alloc_normal(ofi_info, + domain, ep, av); } - rc = fi_ep_bind(endpoint, (fid_t)av, 0); - if (0 != rc) { - BTL_VERBOSE(("%s failed fi_ep_bind with err=%s", - linux_device_name, - fi_strerror(-rc) - )); + if (NULL == module->contexts) { + /* error message is already printed */ goto fail; } /* enable the endpoint for using */ - rc = fi_enable(endpoint); + rc = fi_enable(ep); if (0 != rc) { BTL_VERBOSE(("%s failed fi_enable with err=%s", linux_device_name, @@ -395,19 +441,12 @@ static int mca_btl_ofi_init_device(struct fi_info *info) } /* Everything succeeded, lets create a module for this device. */ - module = (mca_btl_ofi_module_t*) calloc(1, sizeof(mca_btl_ofi_module_t)); - if (NULL == module) { - goto fail; - } - *module = mca_btl_ofi_module_template; - /* store the information. */ module->fabric_info = ofi_info; module->fabric = fabric; module->domain = domain; - module->cq = cq; module->av = av; - module->ofi_endpoint = endpoint; + module->ofi_endpoint = ep; module->linux_device_name = linux_device_name; module->outstanding_rdma = 0; module->use_virt_addr = false; @@ -420,29 +459,13 @@ static int mca_btl_ofi_init_device(struct fi_info *info) /* initialize the rcache */ mca_btl_ofi_rcache_init(module); + /* create endpoint list */ OBJ_CONSTRUCT(&module->endpoints, opal_list_t); - - /* init free lists */ - OBJ_CONSTRUCT(&module->comp_list, opal_free_list_t); - rc = opal_free_list_init(&module->comp_list, - sizeof(mca_btl_ofi_completion_t), - opal_cache_line_size, - OBJ_CLASS(mca_btl_ofi_completion_t), - 0, - 0, - 128, - -1, - 128, - NULL, - 0, - NULL, - NULL, - NULL); - assert(OPAL_SUCCESS == rc); + OBJ_CONSTRUCT(&module->module_lock, opal_mutex_t); /* create and send the modex for this device */ namelen = sizeof(ep_name); - rc = fi_getname((fid_t)endpoint, &ep_name[0], &namelen); + rc = fi_getname((fid_t)ep, &ep_name[0], &namelen); if (0 != rc) { BTL_VERBOSE(("%s failed fi_getname with err=%s", linux_device_name, @@ -466,15 +489,20 @@ static int mca_btl_ofi_init_device(struct fi_info *info) fail: /* clean up */ + + /* if the contexts have not been initiated, num_contexts should + * be zero and we skip this. */ + for (int i=0; i < module->num_contexts; i++) { + mca_btl_ofi_context_finalize(&module->contexts[i], module->is_scalable_ep); + } + free(module->contexts); + if (NULL != av) { fi_close(&av->fid); } - if (NULL != cq) { - fi_close(&cq->fid); - } - if (NULL != endpoint) { - fi_close(&endpoint->fid); + if (NULL != ep) { + fi_close(&ep->fid); } if (NULL != domain) { @@ -484,12 +512,12 @@ fail: if (NULL != fabric) { fi_close(&fabric->fid); } + free(module); /* not really a failure. just skip this device. */ return OPAL_ERR_OUT_OF_RESOURCE; } - /** * @brief OFI BTL progress function * @@ -497,6 +525,44 @@ fail: */ static int mca_btl_ofi_component_progress (void) { + int events = 0; + mca_btl_ofi_context_t *context; + + for (int i = 0 ; i < mca_btl_ofi_component.module_count ; ++i) { + mca_btl_ofi_module_t *module = mca_btl_ofi_component.modules[i]; + + /* progress context we own first. */ + context = get_ofi_context(module); + + if (mca_btl_ofi_context_trylock(context)) { + events += mca_btl_ofi_context_progress(context); + mca_btl_ofi_context_unlock(context); + } + + /* if there is nothing to do, try progress other's. */ + if (events == 0) { + for (int j = 0 ; j < module->num_contexts ; j++ ) { + + context = get_ofi_context_rr(module); + + if (mca_btl_ofi_context_trylock(context)) { + events += mca_btl_ofi_context_progress(context); + mca_btl_ofi_context_unlock(context); + } + + /* If we did something, good enough. return now. + * This is crucial for performance/latency. */ + if (events > 0) { + break; + } + } + } + } + + return events; +} + +int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context) { int ret = 0; int events_read; @@ -506,72 +572,66 @@ static int mca_btl_ofi_component_progress (void) mca_btl_ofi_completion_t *comp; - for (int i = 0 ; i < mca_btl_ofi_component.module_count ; ++i) { - mca_btl_ofi_module_t *module = mca_btl_ofi_component.modules[i]; + ret = fi_cq_read(context->cq, &cq_entry, mca_btl_ofi_component.num_cqe_read); - ret = fi_cq_read(module->cq, &cq_entry, mca_btl_ofi_component.num_cqe_read); + if (0 < ret) { + events_read = ret; + for (int i = 0; i < events_read; i++) { + if (NULL != cq_entry[i].op_context) { + ++events; + comp = (mca_btl_ofi_completion_t*) cq_entry[i].op_context; + mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t*)comp->btl; - if (0 < ret) { - events_read = ret; - for (int j = 0; j < events_read; j++) { - if (NULL != cq_entry[j].op_context) { - ++events; - comp = (mca_btl_ofi_completion_t*) cq_entry[j].op_context; - mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t*)comp->btl; + switch (comp->type) { + case MCA_BTL_OFI_TYPE_GET: + case MCA_BTL_OFI_TYPE_PUT: + case MCA_BTL_OFI_TYPE_AOP: + case MCA_BTL_OFI_TYPE_AFOP: + case MCA_BTL_OFI_TYPE_CSWAP: - switch (comp->type) { - case MCA_BTL_OFI_TYPE_GET: - case MCA_BTL_OFI_TYPE_PUT: - case MCA_BTL_OFI_TYPE_AOP: - case MCA_BTL_OFI_TYPE_AFOP: - case MCA_BTL_OFI_TYPE_CSWAP: - - /* call the callback */ - if (comp->cbfunc) { - comp->cbfunc (comp->btl, comp->endpoint, - comp->local_address, comp->local_handle, - comp->cbcontext, comp->cbdata, OPAL_SUCCESS); - } - - /* return the completion handler */ - opal_free_list_return(comp->my_list, (opal_free_list_item_t*) comp); - - MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl); - break; - - default: - /* catasthrophic */ - BTL_ERROR(("unknown completion type")); - MCA_BTL_OFI_ABORT(); + /* call the callback */ + if (comp->cbfunc) { + comp->cbfunc (comp->btl, comp->endpoint, + comp->local_address, comp->local_handle, + comp->cbcontext, comp->cbdata, OPAL_SUCCESS); } + + /* return the completion handler */ + opal_free_list_return(comp->my_list, (opal_free_list_item_t*) comp); + + MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl); + break; + + default: + /* catasthrophic */ + BTL_ERROR(("unknown completion type")); + MCA_BTL_OFI_ABORT(); } } - } else if (OPAL_UNLIKELY(ret == -FI_EAVAIL)) { - ret = fi_cq_readerr(module->cq, &cqerr, 0); - - /* cq readerr failed!? */ - if (0 > ret) { - BTL_ERROR(("%s:%d: Error returned from fi_cq_readerr: %s(%d)", - __FILE__, __LINE__, fi_strerror(-ret), ret)); - } else { - BTL_ERROR(("fi_cq_readerr: (provider err_code = %d)\n", - cqerr.prov_errno)); - } - - MCA_BTL_OFI_ABORT(); - } + } else if (OPAL_UNLIKELY(ret == -FI_EAVAIL)) { + ret = fi_cq_readerr(context->cq, &cqerr, 0); + + /* cq readerr failed!? */ + if (0 > ret) { + BTL_ERROR(("%s:%d: Error returned from fi_cq_readerr: %s(%d)", + __FILE__, __LINE__, fi_strerror(-ret), ret)); + } else { + BTL_ERROR(("fi_cq_readerr: (provider err_code = %d)\n", + cqerr.prov_errno)); + } + MCA_BTL_OFI_ABORT(); + } #ifdef FI_EINTR - /* sometimes, sockets provider complain about interupt. */ - else if (OPAL_UNLIKELY(ret == -FI_EINTR)) { - continue; - } + /* sometimes, sockets provider complain about interupt. We do nothing. */ + else if (OPAL_UNLIKELY(ret == -FI_EINTR)) { + + } #endif - /* If the error is not FI_EAGAIN, report the error and abort. */ - else if (OPAL_UNLIKELY(ret != -FI_EAGAIN)) { - BTL_ERROR(("fi_cq_read returned error %d:%s", ret, fi_strerror(-ret))); - MCA_BTL_OFI_ABORT(); - } + /* If the error is not FI_EAGAIN, report the error and abort. */ + else if (OPAL_UNLIKELY(ret != -FI_EAGAIN)) { + BTL_ERROR(("fi_cq_read returned error %d:%s", ret, fi_strerror(-ret))); + MCA_BTL_OFI_ABORT(); } return events; @@ -593,5 +653,5 @@ mca_btl_ofi_component_t mca_btl_ofi_component = { .btl_init = mca_btl_ofi_component_init, .btl_progress = mca_btl_ofi_component_progress, - } + }, }; diff --git a/opal/mca/btl/ofi/btl_ofi_endpoint.c b/opal/mca/btl/ofi/btl_ofi_endpoint.c index 871d594ddf..f9552bf106 100644 --- a/opal/mca/btl/ofi/btl_ofi_endpoint.c +++ b/opal/mca/btl/ofi/btl_ofi_endpoint.c @@ -15,6 +15,10 @@ #include "btl_ofi_endpoint.h" #include "opal/util/proc.h" +#if OPAL_HAVE_THREAD_LOCAL +opal_thread_local mca_btl_ofi_context_t *my_context = NULL; +#endif /* OPAL_HAVE_THREAD_LOCAL */ + static void mca_btl_ofi_endpoint_construct (mca_btl_ofi_endpoint_t *endpoint) { endpoint->peer_addr = 0; @@ -49,3 +53,290 @@ mca_btl_base_endpoint_t *mca_btl_ofi_endpoint_create (opal_proc_t *proc, struct return (mca_btl_base_endpoint_t *) endpoint; } +int ofi_comp_list_init(opal_free_list_t *comp_list) +{ + int rc; + OBJ_CONSTRUCT(comp_list, opal_free_list_t); + rc = opal_free_list_init(comp_list, + sizeof(mca_btl_ofi_completion_t), + opal_cache_line_size, + OBJ_CLASS(mca_btl_ofi_completion_t), + 0, + 0, + 128, + -1, + 128, + NULL, + 0, + NULL, + NULL, + NULL); + if (rc != OPAL_SUCCESS) { + BTL_VERBOSE(("cannot allocate completion freelist")); + } + return rc; +} + +/* mca_btl_ofi_context_alloc_normal() + * + * This function will allocate an ofi_context, map the endpoint to tx/rx context, + * bind CQ,AV to the endpoint and initialize all the structure. + * USE WITH NORMAL ENDPOINT ONLY */ +mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_normal(struct fi_info *info, + struct fid_domain *domain, + struct fid_ep *ep, + struct fid_av *av) +{ + int rc; + uint32_t cq_flags = FI_TRANSMIT; + char *linux_device_name = info->domain_attr->name; + + struct fi_cq_attr cq_attr = {0}; + + mca_btl_ofi_context_t *context; + + context = (mca_btl_ofi_context_t*) calloc(1, sizeof(*context)); + if (NULL == context) { + BTL_VERBOSE(("cannot allocate context")); + return NULL; + } + + /* Don't really need to check, just avoiding compiler warning because + * BTL_VERBOSE is a no op in performance build and the compiler will + * complain about unused variable. */ + if (NULL == linux_device_name) { + BTL_VERBOSE(("linux device name is NULL. This shouldn't happen.")); + goto single_fail; + } + + cq_attr.format = FI_CQ_FORMAT_CONTEXT; + cq_attr.wait_obj = FI_WAIT_NONE; + rc = fi_cq_open(domain, &cq_attr, &context->cq, NULL); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_cq_open with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto single_fail; + } + + rc = fi_ep_bind(ep, (fid_t)av, 0); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_ep_bind with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto single_fail; + } + + rc = fi_ep_bind(ep, (fid_t)context->cq, cq_flags); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_scalable_ep_bind with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto single_fail; + } + + rc = ofi_comp_list_init(&context->comp_list); + if (rc != OPAL_SUCCESS) { + goto single_fail; + } + + context->tx_ctx = ep; + context->rx_ctx = ep; + context->context_id = 0; + + return context; + +single_fail: + mca_btl_ofi_context_finalize(context, false); + return NULL; +} + +/* mca_btl_ofi_context_alloc_scalable() + * + * This function allocate communication contexts and return the pointer + * to the first btl context. It also take care of all the bindings needed. + * USE WITH SCALABLE ENDPOINT ONLY */ +mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_scalable(struct fi_info *info, + struct fid_domain *domain, + struct fid_ep *sep, + struct fid_av *av, + size_t num_contexts) +{ + BTL_VERBOSE(("creating %zu contexts", num_contexts)); + + int rc; + size_t i; + char *linux_device_name = info->domain_attr->name; + + struct fi_cq_attr cq_attr = {0}; + struct fi_tx_attr tx_attr = {0}; + struct fi_rx_attr rx_attr = {0}; + + mca_btl_ofi_context_t *contexts; + + contexts = (mca_btl_ofi_context_t*) calloc(num_contexts, sizeof(*contexts)); + if (NULL == contexts) { + BTL_VERBOSE(("cannot allocate communication contexts.")); + return NULL; + } + + /* Don't really need to check, just avoiding compiler warning because + * BTL_VERBOSE is a no op in performance build and the compiler will + * complain about unused variable. */ + if (NULL == linux_device_name) { + BTL_VERBOSE(("linux device name is NULL. This shouldn't happen.")); + goto scalable_fail; + } + + /* bind AV to endpoint */ + rc = fi_scalable_ep_bind(sep, (fid_t)av, 0); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_scalable_ep_bind with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto scalable_fail; + } + + for (i=0; i < num_contexts; i++) { + rc = fi_tx_context(sep, i, &tx_attr, &contexts[i].tx_ctx, NULL); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_tx_context with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto scalable_fail; + } + + /* We don't actually need a receiving context as we only do one-sided. + * However, sockets provider will hang if we dont have one. It is + * also nice to have equal number of tx/rx context. */ + rc = fi_rx_context(sep, i, &rx_attr, &contexts[i].rx_ctx, NULL); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_rx_context with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto scalable_fail; + } + + /* create CQ */ + cq_attr.format = FI_CQ_FORMAT_CONTEXT; + cq_attr.wait_obj = FI_WAIT_NONE; + rc = fi_cq_open(domain, &cq_attr, &contexts[i].cq, NULL); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_cq_open with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto scalable_fail; + } + + /* bind cq to transmit context */ + uint32_t cq_flags = (FI_TRANSMIT); + rc = fi_ep_bind(contexts[i].tx_ctx, (fid_t)contexts[i].cq, cq_flags); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_ep_bind with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto scalable_fail; + } + + /* enable the context. */ + rc = fi_enable(contexts[i].tx_ctx); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_enable with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto scalable_fail; + } + + rc = fi_enable(contexts[i].rx_ctx); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_enable with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto scalable_fail; + } + + /* initialize completion freelist. */ + rc = ofi_comp_list_init(&contexts[i].comp_list); + if (rc != OPAL_SUCCESS) { + goto scalable_fail; + } + + /* assign the id */ + contexts[i].context_id = i; + } + + return contexts; + +scalable_fail: + /* close and free */ + for(i=0; i < num_contexts; i++) { + mca_btl_ofi_context_finalize(&contexts[i], true); + } + free(contexts); + + return NULL; +} + +void mca_btl_ofi_context_finalize(mca_btl_ofi_context_t *context, bool scalable_ep) { + + /* if it is a scalable ep, we have to close all contexts. */ + if (scalable_ep) { + if (NULL != context->tx_ctx) { + fi_close(&context->tx_ctx->fid); + } + + if (NULL != context->rx_ctx) { + fi_close(&context->rx_ctx->fid); + } + } + + if( NULL != context->cq) { + fi_close(&context->cq->fid); + } + + /* Can we destruct the object that hasn't been constructed? */ + OBJ_DESTRUCT(&context->comp_list); +} + +/* Get a context to use for communication. + * If TLS is supported, it will use the cached endpoint. + * If not, it will invoke the normal round-robin assignment. */ +mca_btl_ofi_context_t *get_ofi_context(mca_btl_ofi_module_t *btl) +{ +#if OPAL_HAVE_THREAD_LOCAL + /* With TLS, we cache the context we use. */ + static volatile int64_t cur_num = 0; + + if (OPAL_UNLIKELY(my_context == NULL)) { + OPAL_THREAD_LOCK(&btl->module_lock); + + my_context = &btl->contexts[cur_num]; + cur_num = (cur_num + 1) %btl->num_contexts; + + OPAL_THREAD_UNLOCK(&btl->module_lock); + } + + assert (my_context); + return my_context; +#else + return get_ofi_context_rr(btl); +#endif +} + +/* return the context in a round-robin. */ +/* There is no need for atomics here as it might hurt the performance. */ +mca_btl_ofi_context_t *get_ofi_context_rr(mca_btl_ofi_module_t *btl) +{ + static volatile uint64_t rr_num = 0; + return &btl->contexts[rr_num++%btl->num_contexts]; +} diff --git a/opal/mca/btl/ofi/btl_ofi_endpoint.h b/opal/mca/btl/ofi/btl_ofi_endpoint.h index f131f72b1d..aad758d8c8 100644 --- a/opal/mca/btl/ofi/btl_ofi_endpoint.h +++ b/opal/mca/btl/ofi/btl_ofi_endpoint.h @@ -30,6 +30,10 @@ BEGIN_C_DECLS +#if OPAL_HAVE_THREAD_LOCAL +extern opal_thread_local mca_btl_ofi_context_t *my_context; +#endif /* OPAL_HAVE_THREAD_LOCAL */ + struct mca_btl_base_endpoint_t { opal_list_item_t super; @@ -47,7 +51,25 @@ typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t; typedef mca_btl_base_endpoint_t mca_btl_ofi_endpoint_t; OBJ_CLASS_DECLARATION(mca_btl_ofi_endpoint_t); +int ofi_comp_list_init(opal_free_list_t *comp_list); + mca_btl_base_endpoint_t *mca_btl_ofi_endpoint_create (opal_proc_t *proc, struct fid_ep *ep); +/* contexts */ +mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_scalable(struct fi_info *info, + struct fid_domain *domain, + struct fid_ep *sep, + struct fid_av *av, + size_t num_contexts); + +mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_normal(struct fi_info *info, + struct fid_domain *domain, + struct fid_ep *ep, + struct fid_av *av); +void mca_btl_ofi_context_finalize(mca_btl_ofi_context_t *context, bool scalable_ep); + +mca_btl_ofi_context_t *get_ofi_context(mca_btl_ofi_module_t *btl); +mca_btl_ofi_context_t *get_ofi_context_rr(mca_btl_ofi_module_t *btl); + END_C_DECLS #endif diff --git a/opal/mca/btl/ofi/btl_ofi_module.c b/opal/mca/btl/ofi/btl_ofi_module.c index f8d6d6619c..df6ae1e2e1 100644 --- a/opal/mca/btl/ofi/btl_ofi_module.c +++ b/opal/mca/btl/ofi/btl_ofi_module.c @@ -242,14 +242,17 @@ int mca_btl_ofi_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg) int mca_btl_ofi_finalize (mca_btl_base_module_t* btl) { + int i; mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl; mca_btl_ofi_endpoint_t *endpoint, *next; assert(btl); - if (NULL != ofi_btl->cq) { - fi_close(&ofi_btl->cq->fid); + /* loop over all the contexts */ + for (i=0; i < ofi_btl->num_contexts; i++) { + mca_btl_ofi_context_finalize(&ofi_btl->contexts[i], ofi_btl->is_scalable_ep); } + free(ofi_btl->contexts); if (NULL != ofi_btl->av) { fi_close(&ofi_btl->av->fid); @@ -278,7 +281,6 @@ int mca_btl_ofi_finalize (mca_btl_base_module_t* btl) } OBJ_DESTRUCT(&ofi_btl->endpoints); - OBJ_DESTRUCT(&ofi_btl->comp_list); if (ofi_btl->rcache) { mca_rcache_base_module_destroy (ofi_btl->rcache); diff --git a/opal/mca/btl/ofi/btl_ofi_rdma.c b/opal/mca/btl/ofi/btl_ofi_rdma.c index 854301006b..8f8719ed68 100644 --- a/opal/mca/btl/ofi/btl_ofi_rdma.c +++ b/opal/mca/btl/ofi/btl_ofi_rdma.c @@ -21,26 +21,31 @@ OBJ_CLASS_INSTANCE(mca_btl_ofi_completion_t, mca_btl_ofi_completion_t *mca_btl_ofi_completion_alloc ( mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, + mca_btl_ofi_context_t *ofi_context, void *local_address, mca_btl_base_registration_handle_t *local_handle, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata, int type) { - mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t*)btl; + assert(btl); + assert(endpoint); + assert(ofi_context); + mca_btl_ofi_completion_t *comp; - comp = (mca_btl_ofi_completion_t*) opal_free_list_get(&ofi_btl->comp_list); + comp = (mca_btl_ofi_completion_t*) opal_free_list_get(&ofi_context->comp_list); assert(comp); comp->btl = btl; comp->endpoint = endpoint; + comp->my_context = ofi_context; comp->local_address = local_address; comp->local_handle = local_handle; comp->cbfunc = cbfunc; comp->cbcontext = cbcontext; comp->cbdata = cbdata; - comp->my_list = &ofi_btl->comp_list; + comp->my_list = &ofi_context->comp_list; comp->type = type; return comp; @@ -53,12 +58,17 @@ int mca_btl_ofi_get (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi { int rc; + mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl; mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint; mca_btl_ofi_completion_t *comp; + mca_btl_ofi_context_t *ofi_context; + + ofi_context = get_ofi_context(ofi_btl); /* create completion context */ comp = mca_btl_ofi_completion_alloc(btl, endpoint, + ofi_context, local_address, local_handle, cbfunc, cbcontext, cbdata, @@ -67,7 +77,7 @@ int mca_btl_ofi_get (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi remote_address = (remote_address - (uint64_t) remote_handle->base_addr); /* Remote write data across the wire */ - rc = fi_read(ofi_btl->ofi_endpoint, + rc = fi_read(ofi_context->tx_ctx, local_address, size, /* payload */ local_handle->desc, btl_endpoint->peer_addr, @@ -99,10 +109,14 @@ int mca_btl_ofi_put (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi int rc; mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl; mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint; + mca_btl_ofi_context_t *ofi_context; + + ofi_context = get_ofi_context(ofi_btl); /* create completion context */ mca_btl_ofi_completion_t *comp; comp = mca_btl_ofi_completion_alloc(btl, endpoint, + ofi_context, local_address, local_handle, cbfunc, cbcontext, cbdata, @@ -111,7 +125,7 @@ int mca_btl_ofi_put (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi remote_address = (remote_address - (uint64_t) remote_handle->base_addr); /* Remote write data across the wire */ - rc = fi_write(ofi_btl->ofi_endpoint, + rc = fi_write(ofi_context->tx_ctx, local_address, size, /* payload */ local_handle->desc, btl_endpoint->peer_addr, diff --git a/opal/mca/btl/ofi/btl_ofi_rdma.h b/opal/mca/btl/ofi/btl_ofi_rdma.h index 19eab1e54c..343ada3796 100644 --- a/opal/mca/btl/ofi/btl_ofi_rdma.h +++ b/opal/mca/btl/ofi/btl_ofi_rdma.h @@ -22,6 +22,7 @@ mca_btl_ofi_completion_t *mca_btl_ofi_completion_alloc ( mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, + mca_btl_ofi_context_t *ofi_context, void *local_address, mca_btl_base_registration_handle_t *local_handle, mca_btl_base_rdma_completion_fn_t cbfunc,