diff --git a/opal/mca/btl/ofi/btl_ofi.h b/opal/mca/btl/ofi/btl_ofi.h index ca96e415c7..83107ef9b6 100644 --- a/opal/mca/btl/ofi/btl_ofi.h +++ b/opal/mca/btl/ofi/btl_ofi.h @@ -46,9 +46,7 @@ #include BEGIN_C_DECLS - -#define MCA_BTL_OFI_MAX_MODULES 16 -#define MCA_BTL_OFI_MAX_WORKERS 1 +#define MCA_BTL_OFI_MAX_MODULES 16 #define MCA_BTL_OFI_MAX_CQ_READ_ENTRIES 128 #define MCA_BTL_OFI_ABORT(args) mca_btl_ofi_exit(args) @@ -62,6 +60,26 @@ enum mca_btl_ofi_type { MCA_BTL_OFI_TYPE_TOTAL }; +struct mca_btl_ofi_context_t { + int32_t context_id; + + /* transmit context */ + struct fid_ep *tx_ctx; + struct fid_ep *rx_ctx; + + /* completion queue */ + struct fid_cq *cq; + + /* completion info freelist */ + /* We have it per context to reduce the thread contention + * on the freelist. Things can get really slow. */ + opal_free_list_t comp_list; + + /* for thread locking */ + volatile int32_t lock; +}; +typedef struct mca_btl_ofi_context_t mca_btl_ofi_context_t; + /** * @brief OFI BTL module */ @@ -74,17 +92,17 @@ struct mca_btl_ofi_module_t { struct fid_fabric *fabric; struct fid_domain *domain; struct fid_ep *ofi_endpoint; - struct fid_cq *cq; struct fid_av *av; + int num_contexts; + mca_btl_ofi_context_t *contexts; + char *linux_device_name; /** whether the module has been fully initialized or not */ bool initialized; bool use_virt_addr; - - /** spin-lock to protect the module */ - volatile int32_t lock; + bool is_scalable_ep; int64_t outstanding_rdma; @@ -92,8 +110,7 @@ struct mca_btl_ofi_module_t { * there is no need for a complicated structure here at this time*/ opal_list_t endpoints; - /* free lists */ - opal_free_list_t comp_list; + opal_mutex_t module_lock; /** registration cache */ mca_rcache_base_module_t *rcache; @@ -110,6 +127,7 @@ struct mca_btl_ofi_component_t { /** number of TL modules */ int module_count; + int num_contexts_per_module; int num_cqe_read; size_t namelen; @@ -117,10 +135,6 @@ struct mca_btl_ofi_component_t { /** All BTL OFI modules (1 per tl) */ mca_btl_ofi_module_t *modules[MCA_BTL_OFI_MAX_MODULES]; -#if OPAL_C_HAVE__THREAD_LOCAL - /** bind threads to contexts */ - bool bind_threads_to_contexts; -#endif }; typedef struct mca_btl_ofi_component_t mca_btl_ofi_component_t; @@ -151,6 +165,7 @@ struct mca_btl_ofi_completion_t { struct mca_btl_base_module_t *btl; struct mca_btl_base_endpoint_t *endpoint; + struct mca_btl_ofi_context_t *my_context; uint32_t type; void *local_address; @@ -269,7 +284,25 @@ int mca_btl_ofi_reg_mem (void *reg_data, void *base, size_t size, mca_rcache_base_registration_t *reg); int mca_btl_ofi_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg); +int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context); void mca_btl_ofi_exit(void); +/* thread atomics */ +static inline bool mca_btl_ofi_context_trylock (mca_btl_ofi_context_t *context) +{ + return (context->lock || OPAL_ATOMIC_SWAP_32(&context->lock, 1)); +} + +static inline void mca_btl_ofi_context_lock(mca_btl_ofi_context_t *context) +{ + while (mca_btl_ofi_context_trylock(context)); +} + +static inline void mca_btl_ofi_context_unlock(mca_btl_ofi_context_t *context) +{ + opal_atomic_mb(); + context->lock = 0; +} + END_C_DECLS #endif diff --git a/opal/mca/btl/ofi/btl_ofi_atomics.c b/opal/mca/btl/ofi/btl_ofi_atomics.c index 7d83d5c2b2..d6b4da333e 100644 --- a/opal/mca/btl/ofi/btl_ofi_atomics.c +++ b/opal/mca/btl/ofi/btl_ofi_atomics.c @@ -43,6 +43,9 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end mca_btl_ofi_module_t *ofi_btl = 
(mca_btl_ofi_module_t *) btl; mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint; mca_btl_ofi_completion_t *comp = NULL; + mca_btl_ofi_context_t *ofi_context; + + ofi_context = get_ofi_context(ofi_btl); if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) { fi_datatype = FI_UINT32; @@ -51,6 +54,7 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end fi_op = to_fi_op(op); comp = mca_btl_ofi_completion_alloc(btl, endpoint, + ofi_context, local_address, local_handle, cbfunc, cbcontext, cbdata, @@ -61,7 +65,7 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end remote_address = (remote_address - (uint64_t) remote_handle->base_addr); - rc = fi_fetch_atomic(ofi_btl->ofi_endpoint, + rc = fi_fetch_atomic(ofi_context->tx_ctx, (void*) &comp->operand, 1, NULL, /* operand */ local_address, local_handle->desc, /* results */ btl_endpoint->peer_addr, /* remote addr */ @@ -77,6 +81,9 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl); + /* force a bit of progress. */ + mca_btl_ofi_component.super.btl_progress(); + return OPAL_SUCCESS; } @@ -92,6 +99,9 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl; mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint; mca_btl_ofi_completion_t *comp = NULL; + mca_btl_ofi_context_t *ofi_context; + + ofi_context = get_ofi_context(ofi_btl); if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) { fi_datatype = FI_UINT32; @@ -100,6 +110,7 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t fi_op = to_fi_op(op); comp = mca_btl_ofi_completion_alloc(btl, endpoint, + ofi_context, NULL, NULL, cbfunc, cbcontext, cbdata, @@ -110,7 +121,7 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t remote_address = (remote_address - (uint64_t) remote_handle->base_addr); - rc = fi_atomic(ofi_btl->ofi_endpoint, + rc = fi_atomic(ofi_context->tx_ctx, (void*) &comp->operand, 1, NULL, /* operand */ btl_endpoint->peer_addr, /* remote addr */ remote_address, remote_handle->rkey, /* remote buffer */ @@ -124,6 +135,7 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t } MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl); + mca_btl_ofi_component.super.btl_progress(); return OPAL_SUCCESS; } @@ -139,12 +151,16 @@ int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_e mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl; mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint; mca_btl_ofi_completion_t *comp = NULL; + mca_btl_ofi_context_t *ofi_context; + + ofi_context = get_ofi_context(ofi_btl); if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) { fi_datatype = FI_UINT32; } comp = mca_btl_ofi_completion_alloc(btl, endpoint, + ofi_context, local_address, local_handle, cbfunc, cbcontext, cbdata, @@ -157,7 +173,7 @@ int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_e remote_address = (remote_address - (uint64_t) remote_handle->base_addr); /* perform atomic */ - rc = fi_compare_atomic(ofi_btl->ofi_endpoint, + rc = fi_compare_atomic(ofi_context->tx_ctx, (void*) &comp->operand, 1, NULL, (void*) &comp->compare, NULL, local_address, local_handle->desc, @@ -176,5 +192,8 @@ int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_e MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl); + /* force a bit of progress. 
*/ + mca_btl_ofi_component.super.btl_progress(); + return OPAL_SUCCESS; } diff --git a/opal/mca/btl/ofi/btl_ofi_component.c b/opal/mca/btl/ofi/btl_ofi_component.c index 840d0dfdb8..50ad897ff2 100644 --- a/opal/mca/btl/ofi/btl_ofi_component.c +++ b/opal/mca/btl/ofi/btl_ofi_component.c @@ -31,15 +31,16 @@ #include #include "btl_ofi.h" +#include "btl_ofi_endpoint.h" #include "btl_ofi_rdma.h" - #define MCA_BTL_OFI_REQUIRED_CAPS (FI_RMA | FI_ATOMIC) #define MCA_BTL_OFI_REQUESTED_MR_MODE (FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_VIRT_ADDR) static char *prov_include; static char *prov_exclude; static char *ofi_progress_mode; +static bool disable_sep; static int mca_btl_ofi_init_device(struct fi_info *info); /* validate information returned from fi_getinfo(). @@ -124,16 +125,24 @@ static int mca_btl_ofi_component_register(void) MCA_BASE_VAR_SCOPE_READONLY, &ofi_progress_mode); -#if OPAL_C_HAVE__THREAD_LOCAL - mca_btl_ofi_component.bind_threads_to_contexts = true; + mca_btl_ofi_component.num_contexts_per_module = 1; (void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version, - "bind_threads_to_contexts", "Bind threads to device contexts. " - "In general this should improve the multi-threaded performance " - "when threads are used. (default: true)", MCA_BASE_VAR_TYPE_BOOL, - NULL, 0 ,MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_ALL, - &mca_btl_ofi_component.bind_threads_to_contexts); -#endif + "num_contexts_per_module", + "number of communication context per module to create. " + "This should increase multithreaded performance but it is " + "advised that this number should be lower than total cores.", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_btl_ofi_component.num_contexts_per_module); + disable_sep = false; + (void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version, + "disable_sep", + "force btl/ofi to never use scalable endpoint. 
", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, + &disable_sep); /* for now we want this component to lose to btl/ugni and btl/vader */ module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 50; @@ -148,7 +157,6 @@ static int mca_btl_ofi_component_open(void) return OPAL_SUCCESS; } - /* * component cleanup - sanity checking of queue lengths */ @@ -289,21 +297,34 @@ static int mca_btl_ofi_init_device(struct fi_info *info) int rc; int *module_count = &mca_btl_ofi_component.module_count; size_t namelen; - mca_btl_ofi_module_t *module; + size_t num_contexts_to_create; char *linux_device_name; char ep_name[FI_NAME_MAX]; + struct fi_info *ofi_info; - struct fi_cq_attr cq_attr = {0}; + struct fi_ep_attr *ep_attr; + struct fi_domain_attr *domain_attr; struct fi_av_attr av_attr = {0}; struct fid_fabric *fabric = NULL; struct fid_domain *domain = NULL; - struct fid_ep *endpoint = NULL; - struct fid_cq *cq = NULL; + struct fid_ep *ep = NULL; struct fid_av *av = NULL; + mca_btl_ofi_module_t *module; + + /* allocate module */ + module = (mca_btl_ofi_module_t*) calloc(1, sizeof(mca_btl_ofi_module_t)); + if (NULL == module) { + BTL_ERROR(("failed to allocate memory for OFI module")); + goto fail; + } + *module = mca_btl_ofi_module_template; + /* make a copy of the given info to store on the module */ ofi_info = fi_dupinfo(info); + ep_attr = ofi_info->ep_attr; + domain_attr = ofi_info->domain_attr; linux_device_name = info->domain_attr->name; BTL_VERBOSE(("initializing dev:%s provider:%s", @@ -330,28 +351,6 @@ static int mca_btl_ofi_init_device(struct fi_info *info) goto fail; } - /* endpoint */ - rc = fi_endpoint(domain, ofi_info, &endpoint, NULL); - if (0 != rc) { - BTL_VERBOSE(("%s failed fi_endpoint with err=%s", - linux_device_name, - fi_strerror(-rc) - )); - goto fail; - } - - /* CQ */ - cq_attr.format = FI_CQ_FORMAT_CONTEXT; - cq_attr.wait_obj = FI_WAIT_NONE; - rc = fi_cq_open(domain, &cq_attr, &cq, NULL); - if (0 != rc) { - BTL_VERBOSE(("%s failed fi_cq_open with err=%s", - linux_device_name, - fi_strerror(-rc) - )); - goto fail; - } - /* AV */ av_attr.type = FI_AV_MAP; rc = fi_av_open(domain, &av_attr, &av, NULL); @@ -363,29 +362,76 @@ static int mca_btl_ofi_init_device(struct fi_info *info) goto fail; } + num_contexts_to_create = mca_btl_ofi_component.num_contexts_per_module; - /* bind CQ and AV to endpoint */ - uint32_t cq_flags = (FI_TRANSMIT); - rc = fi_ep_bind(endpoint, (fid_t)cq, cq_flags); - if (0 != rc) { - BTL_VERBOSE(("%s failed fi_ep_bind with err=%s", - linux_device_name, - fi_strerror(-rc) - )); - goto fail; + /* If the domain support scalable endpoint. */ + if (domain_attr->max_ep_tx_ctx > 1 && !disable_sep) { + + BTL_VERBOSE(("btl/ofi using scalable endpoint.")); + + if (num_contexts_to_create > domain_attr->max_ep_tx_ctx) { + BTL_VERBOSE(("cannot create requested %u contexts. 
(node max=%zu)", + module->num_contexts, + domain_attr->max_ep_tx_ctx)); + goto fail; + } + + /* modify the info to let the provider know we are creating x contexts */ + ep_attr->tx_ctx_cnt = num_contexts_to_create; + ep_attr->rx_ctx_cnt = num_contexts_to_create; + + /* create scalable endpoint */ + rc = fi_scalable_ep(domain, ofi_info, &ep, NULL); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_scalable_ep with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto fail; + } + + module->num_contexts = num_contexts_to_create; + module->is_scalable_ep = true; + + /* create contexts */ + module->contexts = mca_btl_ofi_context_alloc_scalable(ofi_info, + domain, ep, av, + num_contexts_to_create); + + } else { + /* warn the user if they want more than 1 context */ + if (num_contexts_to_create > 1) { + BTL_ERROR(("cannot create %zu contexts as the provider does not support " + "scalable endpoint. Falling back to single context endpoint.", + num_contexts_to_create)); + } + + BTL_VERBOSE(("btl/ofi using normal endpoint.")); + + rc = fi_endpoint(domain, ofi_info, &ep, NULL); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_endpoint with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto fail; + } + + module->num_contexts = 1; + module->is_scalable_ep = false; + + /* create contexts */ + module->contexts = mca_btl_ofi_context_alloc_normal(ofi_info, + domain, ep, av); } - rc = fi_ep_bind(endpoint, (fid_t)av, 0); - if (0 != rc) { - BTL_VERBOSE(("%s failed fi_ep_bind with err=%s", - linux_device_name, - fi_strerror(-rc) - )); + if (NULL == module->contexts) { + /* error message is already printed */ goto fail; } /* enable the endpoint for using */ - rc = fi_enable(endpoint); + rc = fi_enable(ep); if (0 != rc) { BTL_VERBOSE(("%s failed fi_enable with err=%s", linux_device_name, @@ -395,19 +441,12 @@ static int mca_btl_ofi_init_device(struct fi_info *info) } /* Everything succeeded, lets create a module for this device. */ - module = (mca_btl_ofi_module_t*) calloc(1, sizeof(mca_btl_ofi_module_t)); - if (NULL == module) { - goto fail; - } - *module = mca_btl_ofi_module_template; - /* store the information. */ module->fabric_info = ofi_info; module->fabric = fabric; module->domain = domain; - module->cq = cq; module->av = av; - module->ofi_endpoint = endpoint; + module->ofi_endpoint = ep; module->linux_device_name = linux_device_name; module->outstanding_rdma = 0; module->use_virt_addr = false; @@ -420,29 +459,13 @@ static int mca_btl_ofi_init_device(struct fi_info *info) /* initialize the rcache */ mca_btl_ofi_rcache_init(module); + /* create endpoint list */ OBJ_CONSTRUCT(&module->endpoints, opal_list_t); - - /* init free lists */ - OBJ_CONSTRUCT(&module->comp_list, opal_free_list_t); - rc = opal_free_list_init(&module->comp_list, - sizeof(mca_btl_ofi_completion_t), - opal_cache_line_size, - OBJ_CLASS(mca_btl_ofi_completion_t), - 0, - 0, - 128, - -1, - 128, - NULL, - 0, - NULL, - NULL, - NULL); - assert(OPAL_SUCCESS == rc); + OBJ_CONSTRUCT(&module->module_lock, opal_mutex_t); /* create and send the modex for this device */ namelen = sizeof(ep_name); - rc = fi_getname((fid_t)endpoint, &ep_name[0], &namelen); + rc = fi_getname((fid_t)ep, &ep_name[0], &namelen); if (0 != rc) { BTL_VERBOSE(("%s failed fi_getname with err=%s", linux_device_name, @@ -466,15 +489,20 @@ static int mca_btl_ofi_init_device(struct fi_info *info) fail: /* clean up */ + + /* if the contexts have not been initiated, num_contexts should + * be zero and we skip this. 
*/ + for (int i=0; i < module->num_contexts; i++) { + mca_btl_ofi_context_finalize(&module->contexts[i], module->is_scalable_ep); + } + free(module->contexts); + if (NULL != av) { fi_close(&av->fid); } - if (NULL != cq) { - fi_close(&cq->fid); - } - if (NULL != endpoint) { - fi_close(&endpoint->fid); + if (NULL != ep) { + fi_close(&ep->fid); } if (NULL != domain) { @@ -484,12 +512,12 @@ fail: if (NULL != fabric) { fi_close(&fabric->fid); } + free(module); /* not really a failure. just skip this device. */ return OPAL_ERR_OUT_OF_RESOURCE; } - /** * @brief OFI BTL progress function * @@ -497,6 +525,44 @@ fail: */ static int mca_btl_ofi_component_progress (void) { + int events = 0; + mca_btl_ofi_context_t *context; + + for (int i = 0 ; i < mca_btl_ofi_component.module_count ; ++i) { + mca_btl_ofi_module_t *module = mca_btl_ofi_component.modules[i]; + + /* progress context we own first. */ + context = get_ofi_context(module); + + if (mca_btl_ofi_context_trylock(context)) { + events += mca_btl_ofi_context_progress(context); + mca_btl_ofi_context_unlock(context); + } + + /* if there is nothing to do, try progress other's. */ + if (events == 0) { + for (int j = 0 ; j < module->num_contexts ; j++ ) { + + context = get_ofi_context_rr(module); + + if (mca_btl_ofi_context_trylock(context)) { + events += mca_btl_ofi_context_progress(context); + mca_btl_ofi_context_unlock(context); + } + + /* If we did something, good enough. return now. + * This is crucial for performance/latency. */ + if (events > 0) { + break; + } + } + } + } + + return events; +} + +int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context) { int ret = 0; int events_read; @@ -506,72 +572,66 @@ static int mca_btl_ofi_component_progress (void) mca_btl_ofi_completion_t *comp; - for (int i = 0 ; i < mca_btl_ofi_component.module_count ; ++i) { - mca_btl_ofi_module_t *module = mca_btl_ofi_component.modules[i]; + ret = fi_cq_read(context->cq, &cq_entry, mca_btl_ofi_component.num_cqe_read); - ret = fi_cq_read(module->cq, &cq_entry, mca_btl_ofi_component.num_cqe_read); + if (0 < ret) { + events_read = ret; + for (int i = 0; i < events_read; i++) { + if (NULL != cq_entry[i].op_context) { + ++events; + comp = (mca_btl_ofi_completion_t*) cq_entry[i].op_context; + mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t*)comp->btl; - if (0 < ret) { - events_read = ret; - for (int j = 0; j < events_read; j++) { - if (NULL != cq_entry[j].op_context) { - ++events; - comp = (mca_btl_ofi_completion_t*) cq_entry[j].op_context; - mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t*)comp->btl; + switch (comp->type) { + case MCA_BTL_OFI_TYPE_GET: + case MCA_BTL_OFI_TYPE_PUT: + case MCA_BTL_OFI_TYPE_AOP: + case MCA_BTL_OFI_TYPE_AFOP: + case MCA_BTL_OFI_TYPE_CSWAP: - switch (comp->type) { - case MCA_BTL_OFI_TYPE_GET: - case MCA_BTL_OFI_TYPE_PUT: - case MCA_BTL_OFI_TYPE_AOP: - case MCA_BTL_OFI_TYPE_AFOP: - case MCA_BTL_OFI_TYPE_CSWAP: - - /* call the callback */ - if (comp->cbfunc) { - comp->cbfunc (comp->btl, comp->endpoint, - comp->local_address, comp->local_handle, - comp->cbcontext, comp->cbdata, OPAL_SUCCESS); - } - - /* return the completion handler */ - opal_free_list_return(comp->my_list, (opal_free_list_item_t*) comp); - - MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl); - break; - - default: - /* catasthrophic */ - BTL_ERROR(("unknown completion type")); - MCA_BTL_OFI_ABORT(); + /* call the callback */ + if (comp->cbfunc) { + comp->cbfunc (comp->btl, comp->endpoint, + comp->local_address, comp->local_handle, + comp->cbcontext, comp->cbdata, OPAL_SUCCESS); 
} + + /* return the completion handler */ + opal_free_list_return(comp->my_list, (opal_free_list_item_t*) comp); + + MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl); + break; + + default: + /* catasthrophic */ + BTL_ERROR(("unknown completion type")); + MCA_BTL_OFI_ABORT(); } } - } else if (OPAL_UNLIKELY(ret == -FI_EAVAIL)) { - ret = fi_cq_readerr(module->cq, &cqerr, 0); - - /* cq readerr failed!? */ - if (0 > ret) { - BTL_ERROR(("%s:%d: Error returned from fi_cq_readerr: %s(%d)", - __FILE__, __LINE__, fi_strerror(-ret), ret)); - } else { - BTL_ERROR(("fi_cq_readerr: (provider err_code = %d)\n", - cqerr.prov_errno)); - } - - MCA_BTL_OFI_ABORT(); - } + } else if (OPAL_UNLIKELY(ret == -FI_EAVAIL)) { + ret = fi_cq_readerr(context->cq, &cqerr, 0); + + /* cq readerr failed!? */ + if (0 > ret) { + BTL_ERROR(("%s:%d: Error returned from fi_cq_readerr: %s(%d)", + __FILE__, __LINE__, fi_strerror(-ret), ret)); + } else { + BTL_ERROR(("fi_cq_readerr: (provider err_code = %d)\n", + cqerr.prov_errno)); + } + MCA_BTL_OFI_ABORT(); + } #ifdef FI_EINTR - /* sometimes, sockets provider complain about interupt. */ - else if (OPAL_UNLIKELY(ret == -FI_EINTR)) { - continue; - } + /* sometimes, sockets provider complain about interupt. We do nothing. */ + else if (OPAL_UNLIKELY(ret == -FI_EINTR)) { + + } #endif - /* If the error is not FI_EAGAIN, report the error and abort. */ - else if (OPAL_UNLIKELY(ret != -FI_EAGAIN)) { - BTL_ERROR(("fi_cq_read returned error %d:%s", ret, fi_strerror(-ret))); - MCA_BTL_OFI_ABORT(); - } + /* If the error is not FI_EAGAIN, report the error and abort. */ + else if (OPAL_UNLIKELY(ret != -FI_EAGAIN)) { + BTL_ERROR(("fi_cq_read returned error %d:%s", ret, fi_strerror(-ret))); + MCA_BTL_OFI_ABORT(); } return events; @@ -593,5 +653,5 @@ mca_btl_ofi_component_t mca_btl_ofi_component = { .btl_init = mca_btl_ofi_component_init, .btl_progress = mca_btl_ofi_component_progress, - } + }, }; diff --git a/opal/mca/btl/ofi/btl_ofi_endpoint.c b/opal/mca/btl/ofi/btl_ofi_endpoint.c index 871d594ddf..f9552bf106 100644 --- a/opal/mca/btl/ofi/btl_ofi_endpoint.c +++ b/opal/mca/btl/ofi/btl_ofi_endpoint.c @@ -15,6 +15,10 @@ #include "btl_ofi_endpoint.h" #include "opal/util/proc.h" +#if OPAL_HAVE_THREAD_LOCAL +opal_thread_local mca_btl_ofi_context_t *my_context = NULL; +#endif /* OPAL_HAVE_THREAD_LOCAL */ + static void mca_btl_ofi_endpoint_construct (mca_btl_ofi_endpoint_t *endpoint) { endpoint->peer_addr = 0; @@ -49,3 +53,290 @@ mca_btl_base_endpoint_t *mca_btl_ofi_endpoint_create (opal_proc_t *proc, struct return (mca_btl_base_endpoint_t *) endpoint; } +int ofi_comp_list_init(opal_free_list_t *comp_list) +{ + int rc; + OBJ_CONSTRUCT(comp_list, opal_free_list_t); + rc = opal_free_list_init(comp_list, + sizeof(mca_btl_ofi_completion_t), + opal_cache_line_size, + OBJ_CLASS(mca_btl_ofi_completion_t), + 0, + 0, + 128, + -1, + 128, + NULL, + 0, + NULL, + NULL, + NULL); + if (rc != OPAL_SUCCESS) { + BTL_VERBOSE(("cannot allocate completion freelist")); + } + return rc; +} + +/* mca_btl_ofi_context_alloc_normal() + * + * This function will allocate an ofi_context, map the endpoint to tx/rx context, + * bind CQ,AV to the endpoint and initialize all the structure. 
+ * USE WITH NORMAL ENDPOINT ONLY */ +mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_normal(struct fi_info *info, + struct fid_domain *domain, + struct fid_ep *ep, + struct fid_av *av) +{ + int rc; + uint32_t cq_flags = FI_TRANSMIT; + char *linux_device_name = info->domain_attr->name; + + struct fi_cq_attr cq_attr = {0}; + + mca_btl_ofi_context_t *context; + + context = (mca_btl_ofi_context_t*) calloc(1, sizeof(*context)); + if (NULL == context) { + BTL_VERBOSE(("cannot allocate context")); + return NULL; + } + + /* Don't really need to check, just avoiding compiler warning because + * BTL_VERBOSE is a no op in performance build and the compiler will + * complain about unused variable. */ + if (NULL == linux_device_name) { + BTL_VERBOSE(("linux device name is NULL. This shouldn't happen.")); + goto single_fail; + } + + cq_attr.format = FI_CQ_FORMAT_CONTEXT; + cq_attr.wait_obj = FI_WAIT_NONE; + rc = fi_cq_open(domain, &cq_attr, &context->cq, NULL); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_cq_open with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto single_fail; + } + + rc = fi_ep_bind(ep, (fid_t)av, 0); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_ep_bind with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto single_fail; + } + + rc = fi_ep_bind(ep, (fid_t)context->cq, cq_flags); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_scalable_ep_bind with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto single_fail; + } + + rc = ofi_comp_list_init(&context->comp_list); + if (rc != OPAL_SUCCESS) { + goto single_fail; + } + + context->tx_ctx = ep; + context->rx_ctx = ep; + context->context_id = 0; + + return context; + +single_fail: + mca_btl_ofi_context_finalize(context, false); + return NULL; +} + +/* mca_btl_ofi_context_alloc_scalable() + * + * This function allocate communication contexts and return the pointer + * to the first btl context. It also take care of all the bindings needed. + * USE WITH SCALABLE ENDPOINT ONLY */ +mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_scalable(struct fi_info *info, + struct fid_domain *domain, + struct fid_ep *sep, + struct fid_av *av, + size_t num_contexts) +{ + BTL_VERBOSE(("creating %zu contexts", num_contexts)); + + int rc; + size_t i; + char *linux_device_name = info->domain_attr->name; + + struct fi_cq_attr cq_attr = {0}; + struct fi_tx_attr tx_attr = {0}; + struct fi_rx_attr rx_attr = {0}; + + mca_btl_ofi_context_t *contexts; + + contexts = (mca_btl_ofi_context_t*) calloc(num_contexts, sizeof(*contexts)); + if (NULL == contexts) { + BTL_VERBOSE(("cannot allocate communication contexts.")); + return NULL; + } + + /* Don't really need to check, just avoiding compiler warning because + * BTL_VERBOSE is a no op in performance build and the compiler will + * complain about unused variable. */ + if (NULL == linux_device_name) { + BTL_VERBOSE(("linux device name is NULL. This shouldn't happen.")); + goto scalable_fail; + } + + /* bind AV to endpoint */ + rc = fi_scalable_ep_bind(sep, (fid_t)av, 0); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_scalable_ep_bind with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto scalable_fail; + } + + for (i=0; i < num_contexts; i++) { + rc = fi_tx_context(sep, i, &tx_attr, &contexts[i].tx_ctx, NULL); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_tx_context with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto scalable_fail; + } + + /* We don't actually need a receiving context as we only do one-sided. 
+ * However, sockets provider will hang if we dont have one. It is + * also nice to have equal number of tx/rx context. */ + rc = fi_rx_context(sep, i, &rx_attr, &contexts[i].rx_ctx, NULL); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_rx_context with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto scalable_fail; + } + + /* create CQ */ + cq_attr.format = FI_CQ_FORMAT_CONTEXT; + cq_attr.wait_obj = FI_WAIT_NONE; + rc = fi_cq_open(domain, &cq_attr, &contexts[i].cq, NULL); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_cq_open with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto scalable_fail; + } + + /* bind cq to transmit context */ + uint32_t cq_flags = (FI_TRANSMIT); + rc = fi_ep_bind(contexts[i].tx_ctx, (fid_t)contexts[i].cq, cq_flags); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_ep_bind with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto scalable_fail; + } + + /* enable the context. */ + rc = fi_enable(contexts[i].tx_ctx); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_enable with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto scalable_fail; + } + + rc = fi_enable(contexts[i].rx_ctx); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_enable with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto scalable_fail; + } + + /* initialize completion freelist. */ + rc = ofi_comp_list_init(&contexts[i].comp_list); + if (rc != OPAL_SUCCESS) { + goto scalable_fail; + } + + /* assign the id */ + contexts[i].context_id = i; + } + + return contexts; + +scalable_fail: + /* close and free */ + for(i=0; i < num_contexts; i++) { + mca_btl_ofi_context_finalize(&contexts[i], true); + } + free(contexts); + + return NULL; +} + +void mca_btl_ofi_context_finalize(mca_btl_ofi_context_t *context, bool scalable_ep) { + + /* if it is a scalable ep, we have to close all contexts. */ + if (scalable_ep) { + if (NULL != context->tx_ctx) { + fi_close(&context->tx_ctx->fid); + } + + if (NULL != context->rx_ctx) { + fi_close(&context->rx_ctx->fid); + } + } + + if( NULL != context->cq) { + fi_close(&context->cq->fid); + } + + /* Can we destruct the object that hasn't been constructed? */ + OBJ_DESTRUCT(&context->comp_list); +} + +/* Get a context to use for communication. + * If TLS is supported, it will use the cached endpoint. + * If not, it will invoke the normal round-robin assignment. */ +mca_btl_ofi_context_t *get_ofi_context(mca_btl_ofi_module_t *btl) +{ +#if OPAL_HAVE_THREAD_LOCAL + /* With TLS, we cache the context we use. */ + static volatile int64_t cur_num = 0; + + if (OPAL_UNLIKELY(my_context == NULL)) { + OPAL_THREAD_LOCK(&btl->module_lock); + + my_context = &btl->contexts[cur_num]; + cur_num = (cur_num + 1) %btl->num_contexts; + + OPAL_THREAD_UNLOCK(&btl->module_lock); + } + + assert (my_context); + return my_context; +#else + return get_ofi_context_rr(btl); +#endif +} + +/* return the context in a round-robin. */ +/* There is no need for atomics here as it might hurt the performance. 
*/ +mca_btl_ofi_context_t *get_ofi_context_rr(mca_btl_ofi_module_t *btl) +{ + static volatile uint64_t rr_num = 0; + return &btl->contexts[rr_num++%btl->num_contexts]; +} diff --git a/opal/mca/btl/ofi/btl_ofi_endpoint.h b/opal/mca/btl/ofi/btl_ofi_endpoint.h index f131f72b1d..aad758d8c8 100644 --- a/opal/mca/btl/ofi/btl_ofi_endpoint.h +++ b/opal/mca/btl/ofi/btl_ofi_endpoint.h @@ -30,6 +30,10 @@ BEGIN_C_DECLS +#if OPAL_HAVE_THREAD_LOCAL +extern opal_thread_local mca_btl_ofi_context_t *my_context; +#endif /* OPAL_HAVE_THREAD_LOCAL */ + struct mca_btl_base_endpoint_t { opal_list_item_t super; @@ -47,7 +51,25 @@ typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t; typedef mca_btl_base_endpoint_t mca_btl_ofi_endpoint_t; OBJ_CLASS_DECLARATION(mca_btl_ofi_endpoint_t); +int ofi_comp_list_init(opal_free_list_t *comp_list); + mca_btl_base_endpoint_t *mca_btl_ofi_endpoint_create (opal_proc_t *proc, struct fid_ep *ep); +/* contexts */ +mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_scalable(struct fi_info *info, + struct fid_domain *domain, + struct fid_ep *sep, + struct fid_av *av, + size_t num_contexts); + +mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_normal(struct fi_info *info, + struct fid_domain *domain, + struct fid_ep *ep, + struct fid_av *av); +void mca_btl_ofi_context_finalize(mca_btl_ofi_context_t *context, bool scalable_ep); + +mca_btl_ofi_context_t *get_ofi_context(mca_btl_ofi_module_t *btl); +mca_btl_ofi_context_t *get_ofi_context_rr(mca_btl_ofi_module_t *btl); + END_C_DECLS #endif diff --git a/opal/mca/btl/ofi/btl_ofi_module.c b/opal/mca/btl/ofi/btl_ofi_module.c index f8d6d6619c..df6ae1e2e1 100644 --- a/opal/mca/btl/ofi/btl_ofi_module.c +++ b/opal/mca/btl/ofi/btl_ofi_module.c @@ -242,14 +242,17 @@ int mca_btl_ofi_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg) int mca_btl_ofi_finalize (mca_btl_base_module_t* btl) { + int i; mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl; mca_btl_ofi_endpoint_t *endpoint, *next; assert(btl); - if (NULL != ofi_btl->cq) { - fi_close(&ofi_btl->cq->fid); + /* loop over all the contexts */ + for (i=0; i < ofi_btl->num_contexts; i++) { + mca_btl_ofi_context_finalize(&ofi_btl->contexts[i], ofi_btl->is_scalable_ep); } + free(ofi_btl->contexts); if (NULL != ofi_btl->av) { fi_close(&ofi_btl->av->fid); @@ -278,7 +281,6 @@ int mca_btl_ofi_finalize (mca_btl_base_module_t* btl) } OBJ_DESTRUCT(&ofi_btl->endpoints); - OBJ_DESTRUCT(&ofi_btl->comp_list); if (ofi_btl->rcache) { mca_rcache_base_module_destroy (ofi_btl->rcache); diff --git a/opal/mca/btl/ofi/btl_ofi_rdma.c b/opal/mca/btl/ofi/btl_ofi_rdma.c index 854301006b..8f8719ed68 100644 --- a/opal/mca/btl/ofi/btl_ofi_rdma.c +++ b/opal/mca/btl/ofi/btl_ofi_rdma.c @@ -21,26 +21,31 @@ OBJ_CLASS_INSTANCE(mca_btl_ofi_completion_t, mca_btl_ofi_completion_t *mca_btl_ofi_completion_alloc ( mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, + mca_btl_ofi_context_t *ofi_context, void *local_address, mca_btl_base_registration_handle_t *local_handle, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata, int type) { - mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t*)btl; + assert(btl); + assert(endpoint); + assert(ofi_context); + mca_btl_ofi_completion_t *comp; - comp = (mca_btl_ofi_completion_t*) opal_free_list_get(&ofi_btl->comp_list); + comp = (mca_btl_ofi_completion_t*) opal_free_list_get(&ofi_context->comp_list); assert(comp); comp->btl = btl; comp->endpoint = endpoint; + comp->my_context = ofi_context; comp->local_address = local_address; 
comp->local_handle = local_handle; comp->cbfunc = cbfunc; comp->cbcontext = cbcontext; comp->cbdata = cbdata; - comp->my_list = &ofi_btl->comp_list; + comp->my_list = &ofi_context->comp_list; comp->type = type; return comp; @@ -53,12 +58,17 @@ int mca_btl_ofi_get (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi { int rc; + mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl; mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint; mca_btl_ofi_completion_t *comp; + mca_btl_ofi_context_t *ofi_context; + + ofi_context = get_ofi_context(ofi_btl); /* create completion context */ comp = mca_btl_ofi_completion_alloc(btl, endpoint, + ofi_context, local_address, local_handle, cbfunc, cbcontext, cbdata, @@ -67,7 +77,7 @@ int mca_btl_ofi_get (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi remote_address = (remote_address - (uint64_t) remote_handle->base_addr); /* Remote write data across the wire */ - rc = fi_read(ofi_btl->ofi_endpoint, + rc = fi_read(ofi_context->tx_ctx, local_address, size, /* payload */ local_handle->desc, btl_endpoint->peer_addr, @@ -99,10 +109,14 @@ int mca_btl_ofi_put (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi int rc; mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl; mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint; + mca_btl_ofi_context_t *ofi_context; + + ofi_context = get_ofi_context(ofi_btl); /* create completion context */ mca_btl_ofi_completion_t *comp; comp = mca_btl_ofi_completion_alloc(btl, endpoint, + ofi_context, local_address, local_handle, cbfunc, cbcontext, cbdata, @@ -111,7 +125,7 @@ int mca_btl_ofi_put (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi remote_address = (remote_address - (uint64_t) remote_handle->base_addr); /* Remote write data across the wire */ - rc = fi_write(ofi_btl->ofi_endpoint, + rc = fi_write(ofi_context->tx_ctx, local_address, size, /* payload */ local_handle->desc, btl_endpoint->peer_addr, diff --git a/opal/mca/btl/ofi/btl_ofi_rdma.h b/opal/mca/btl/ofi/btl_ofi_rdma.h index 19eab1e54c..343ada3796 100644 --- a/opal/mca/btl/ofi/btl_ofi_rdma.h +++ b/opal/mca/btl/ofi/btl_ofi_rdma.h @@ -22,6 +22,7 @@ mca_btl_ofi_completion_t *mca_btl_ofi_completion_alloc ( mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, + mca_btl_ofi_context_t *ofi_context, void *local_address, mca_btl_base_registration_handle_t *local_handle, mca_btl_base_rdma_completion_fn_t cbfunc,
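
The progress scheme added above (drain the context this thread owns first, then round-robin over the remaining contexts under a try-lock, returning as soon as any events were handled) can be modeled without OPAL or libfabric. The sketch below is a minimal, self-contained illustration only: it substitutes C11 <stdatomic.h> for OPAL_ATOMIC_SWAP_32()/opal_atomic_mb(), and a dummy poll_cq() stands in for fi_cq_read() plus the completion callbacks; all names here (toy_context_t, poll_cq, NUM_CONTEXTS) are hypothetical. Note that this sketch's ctx_trylock() returns true on success, which is the opposite convention from mca_btl_ofi_context_trylock() defined in btl_ofi.h above (that helper returns non-zero when the lock is already held).

/* Minimal stand-alone model of the per-context progress scheme:
 * try-lock the caller's own context first, and only if nothing was
 * drained, round-robin over the other contexts.  Uses C11 atomics and
 * a dummy poll_cq() instead of OPAL atomics and fi_cq_read(). */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NUM_CONTEXTS 4

typedef struct {
    int id;
    atomic_int lock;     /* 0 = free, 1 = held */
    int pending;         /* stand-in for completions waiting in the CQ */
} toy_context_t;

static toy_context_t contexts[NUM_CONTEXTS];

/* try to take the lock; true on success (note: inverted vs. the patch's helper) */
static bool ctx_trylock(toy_context_t *c)
{
    int expected = 0;
    return atomic_compare_exchange_strong(&c->lock, &expected, 1);
}

static void ctx_unlock(toy_context_t *c)
{
    atomic_store(&c->lock, 0);
}

/* stand-in for fi_cq_read() + running the completion callbacks */
static int poll_cq(toy_context_t *c)
{
    int events = c->pending;
    c->pending = 0;
    return events;
}

/* same shape as mca_btl_ofi_component_progress() for one module */
static int progress(int my_ctx)
{
    static atomic_uint rr = 0;   /* shared round-robin cursor */
    int events = 0;

    /* 1. progress the context this thread "owns" */
    if (ctx_trylock(&contexts[my_ctx])) {
        events += poll_cq(&contexts[my_ctx]);
        ctx_unlock(&contexts[my_ctx]);
    }

    /* 2. nothing there? help drain the other contexts */
    if (0 == events) {
        for (int j = 0; j < NUM_CONTEXTS; j++) {
            toy_context_t *c = &contexts[atomic_fetch_add(&rr, 1) % NUM_CONTEXTS];
            if (ctx_trylock(c)) {
                events += poll_cq(c);
                ctx_unlock(c);
            }
            if (events > 0) {
                break;   /* did something; return early for latency */
            }
        }
    }
    return events;
}

int main(void)
{
    for (int i = 0; i < NUM_CONTEXTS; i++) {
        contexts[i].id = i;
        contexts[i].pending = i;   /* fake some outstanding completions */
    }
    printf("drained %d events\n", progress(0));
    printf("drained %d events\n", progress(0));
    return 0;
}

The intent of the pattern is the same as in the patch: under normal load each thread touches only its own context's CQ and completion freelist, so the per-context comp_list and per-context lock keep threads from contending on one module-wide structure, and the round-robin pass only kicks in when a thread would otherwise have nothing to do.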