1
1

btl/ofi: add scalable endpoint support.

This commit adds support for scalable endpoints to enhance multithreaded
application performance. The BTL will detect scalable endpoint support from
the OFI provider and will fall back to a normal endpoint if scalable
endpoints are not supported.

NEW MCA parameters:
- mca_btl_ofi_disable_sep: force the btl to not use scalable endpoint.
- mca_btl_ofi_num_contexts_per_module: number of communication contexts
  to create (should be the same as the number of threads).

Signed-off-by: Thananon Patinyasakdikul <thananon.patinyasakdikul@intel.com>
Этот коммит содержится в:
Thananon Patinyasakdikul 2018-06-07 09:33:12 -07:00
родитель 08cfacddee
Коммит dae3c9447c
8 изменённых файлов: 610 добавлений и 168 удалений

Просмотреть файл

@ -46,9 +46,7 @@
#include <rdma/fi_rma.h>
BEGIN_C_DECLS
#define MCA_BTL_OFI_MAX_MODULES 16
#define MCA_BTL_OFI_MAX_WORKERS 1
#define MCA_BTL_OFI_MAX_MODULES 16
#define MCA_BTL_OFI_MAX_CQ_READ_ENTRIES 128
#define MCA_BTL_OFI_ABORT(args) mca_btl_ofi_exit(args)
@ -62,6 +60,26 @@ enum mca_btl_ofi_type {
MCA_BTL_OFI_TYPE_TOTAL
};
/* A single communication context.  With a scalable endpoint a module owns
 * several of these (one per tx/rx context pair); with a normal endpoint
 * there is exactly one and tx_ctx/rx_ctx both alias the endpoint itself
 * (see mca_btl_ofi_context_alloc_normal/_scalable). */
struct mca_btl_ofi_context_t {
    /* index of this context within the module's contexts array */
    int32_t context_id;

    /* transmit context */
    struct fid_ep *tx_ctx;
    /* receive context; the BTL is one-sided only, but some providers
     * require an rx context to exist (see alloc_scalable) */
    struct fid_ep *rx_ctx;

    /* completion queue bound to tx_ctx */
    struct fid_cq *cq;

    /* completion info freelist */
    /* We have it per context to reduce the thread contention
     * on the freelist. Things can get really slow. */
    opal_free_list_t comp_list;

    /* for thread locking */
    /* spin-lock word: 0 = free, 1 = held
     * (see mca_btl_ofi_context_trylock/_lock/_unlock) */
    volatile int32_t lock;
};
typedef struct mca_btl_ofi_context_t mca_btl_ofi_context_t;
/**
* @brief OFI BTL module
*/
@ -74,17 +92,17 @@ struct mca_btl_ofi_module_t {
struct fid_fabric *fabric;
struct fid_domain *domain;
struct fid_ep *ofi_endpoint;
struct fid_cq *cq;
struct fid_av *av;
int num_contexts;
mca_btl_ofi_context_t *contexts;
char *linux_device_name;
/** whether the module has been fully initialized or not */
bool initialized;
bool use_virt_addr;
/** spin-lock to protect the module */
volatile int32_t lock;
bool is_scalable_ep;
int64_t outstanding_rdma;
@ -92,8 +110,7 @@ struct mca_btl_ofi_module_t {
* there is no need for a complicated structure here at this time*/
opal_list_t endpoints;
/* free lists */
opal_free_list_t comp_list;
opal_mutex_t module_lock;
/** registration cache */
mca_rcache_base_module_t *rcache;
@ -110,6 +127,7 @@ struct mca_btl_ofi_component_t {
/** number of TL modules */
int module_count;
int num_contexts_per_module;
int num_cqe_read;
size_t namelen;
@ -117,10 +135,6 @@ struct mca_btl_ofi_component_t {
/** All BTL OFI modules (1 per tl) */
mca_btl_ofi_module_t *modules[MCA_BTL_OFI_MAX_MODULES];
#if OPAL_C_HAVE__THREAD_LOCAL
/** bind threads to contexts */
bool bind_threads_to_contexts;
#endif
};
typedef struct mca_btl_ofi_component_t mca_btl_ofi_component_t;
@ -151,6 +165,7 @@ struct mca_btl_ofi_completion_t {
struct mca_btl_base_module_t *btl;
struct mca_btl_base_endpoint_t *endpoint;
struct mca_btl_ofi_context_t *my_context;
uint32_t type;
void *local_address;
@ -269,7 +284,25 @@ int mca_btl_ofi_reg_mem (void *reg_data, void *base, size_t size,
mca_rcache_base_registration_t *reg);
int mca_btl_ofi_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg);
int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context);
void mca_btl_ofi_exit(void);
/* thread atomics */
/* Try to take the per-context spin lock.
 *
 * Returns false (0) when THIS caller acquired the lock, and true when the
 * lock was already held -- i.e. the return value means "acquisition
 * FAILED" (see mca_btl_ofi_context_lock, which spins while this returns
 * nonzero).  The plain read of context->lock short-circuits the atomic
 * swap when the lock is visibly busy.
 *
 * NOTE(review): assumes OPAL_ATOMIC_SWAP_32 returns the previous value of
 * the lock word -- confirm against opal atomics. */
static inline bool mca_btl_ofi_context_trylock (mca_btl_ofi_context_t *context)
{
    return (context->lock || OPAL_ATOMIC_SWAP_32(&context->lock, 1));
}
/* Spin until the per-context lock is acquired: trylock returns nonzero
 * for as long as another thread holds the lock. */
static inline void mca_btl_ofi_context_lock(mca_btl_ofi_context_t *context)
{
    while (mca_btl_ofi_context_trylock(context));
}
/* Release the per-context lock.  The memory barrier orders all accesses
 * made inside the critical section before the store that frees the lock
 * word. */
static inline void mca_btl_ofi_context_unlock(mca_btl_ofi_context_t *context)
{
    opal_atomic_mb();
    context->lock = 0;
}
END_C_DECLS
#endif

Просмотреть файл

@ -43,6 +43,9 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
mca_btl_ofi_completion_t *comp = NULL;
mca_btl_ofi_context_t *ofi_context;
ofi_context = get_ofi_context(ofi_btl);
if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) {
fi_datatype = FI_UINT32;
@ -51,6 +54,7 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end
fi_op = to_fi_op(op);
comp = mca_btl_ofi_completion_alloc(btl, endpoint,
ofi_context,
local_address,
local_handle,
cbfunc, cbcontext, cbdata,
@ -61,7 +65,7 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end
remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
rc = fi_fetch_atomic(ofi_btl->ofi_endpoint,
rc = fi_fetch_atomic(ofi_context->tx_ctx,
(void*) &comp->operand, 1, NULL, /* operand */
local_address, local_handle->desc, /* results */
btl_endpoint->peer_addr, /* remote addr */
@ -77,6 +81,9 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
/* force a bit of progress. */
mca_btl_ofi_component.super.btl_progress();
return OPAL_SUCCESS;
}
@ -92,6 +99,9 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
mca_btl_ofi_completion_t *comp = NULL;
mca_btl_ofi_context_t *ofi_context;
ofi_context = get_ofi_context(ofi_btl);
if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) {
fi_datatype = FI_UINT32;
@ -100,6 +110,7 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
fi_op = to_fi_op(op);
comp = mca_btl_ofi_completion_alloc(btl, endpoint,
ofi_context,
NULL,
NULL,
cbfunc, cbcontext, cbdata,
@ -110,7 +121,7 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
rc = fi_atomic(ofi_btl->ofi_endpoint,
rc = fi_atomic(ofi_context->tx_ctx,
(void*) &comp->operand, 1, NULL, /* operand */
btl_endpoint->peer_addr, /* remote addr */
remote_address, remote_handle->rkey, /* remote buffer */
@ -124,6 +135,7 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
}
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
mca_btl_ofi_component.super.btl_progress();
return OPAL_SUCCESS;
}
@ -139,12 +151,16 @@ int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_e
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
mca_btl_ofi_completion_t *comp = NULL;
mca_btl_ofi_context_t *ofi_context;
ofi_context = get_ofi_context(ofi_btl);
if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) {
fi_datatype = FI_UINT32;
}
comp = mca_btl_ofi_completion_alloc(btl, endpoint,
ofi_context,
local_address,
local_handle,
cbfunc, cbcontext, cbdata,
@ -157,7 +173,7 @@ int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_e
remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
/* perform atomic */
rc = fi_compare_atomic(ofi_btl->ofi_endpoint,
rc = fi_compare_atomic(ofi_context->tx_ctx,
(void*) &comp->operand, 1, NULL,
(void*) &comp->compare, NULL,
local_address, local_handle->desc,
@ -176,5 +192,8 @@ int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_e
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
/* force a bit of progress. */
mca_btl_ofi_component.super.btl_progress();
return OPAL_SUCCESS;
}

Просмотреть файл

@ -31,15 +31,16 @@
#include <string.h>
#include "btl_ofi.h"
#include "btl_ofi_endpoint.h"
#include "btl_ofi_rdma.h"
#define MCA_BTL_OFI_REQUIRED_CAPS (FI_RMA | FI_ATOMIC)
#define MCA_BTL_OFI_REQUESTED_MR_MODE (FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_VIRT_ADDR)
static char *prov_include;
static char *prov_exclude;
static char *ofi_progress_mode;
static bool disable_sep;
static int mca_btl_ofi_init_device(struct fi_info *info);
/* validate information returned from fi_getinfo().
@ -124,16 +125,24 @@ static int mca_btl_ofi_component_register(void)
MCA_BASE_VAR_SCOPE_READONLY,
&ofi_progress_mode);
#if OPAL_C_HAVE__THREAD_LOCAL
mca_btl_ofi_component.bind_threads_to_contexts = true;
mca_btl_ofi_component.num_contexts_per_module = 1;
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
"bind_threads_to_contexts", "Bind threads to device contexts. "
"In general this should improve the multi-threaded performance "
"when threads are used. (default: true)", MCA_BASE_VAR_TYPE_BOOL,
NULL, 0 ,MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_ALL,
&mca_btl_ofi_component.bind_threads_to_contexts);
#endif
"num_contexts_per_module",
"number of communication context per module to create. "
"This should increase multithreaded performance but it is "
"advised that this number should be lower than total cores.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_btl_ofi_component.num_contexts_per_module);
disable_sep = false;
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
"disable_sep",
"force btl/ofi to never use scalable endpoint. ",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&disable_sep);
/* for now we want this component to lose to btl/ugni and btl/vader */
module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 50;
@ -148,7 +157,6 @@ static int mca_btl_ofi_component_open(void)
return OPAL_SUCCESS;
}
/*
* component cleanup - sanity checking of queue lengths
*/
@ -289,21 +297,34 @@ static int mca_btl_ofi_init_device(struct fi_info *info)
int rc;
int *module_count = &mca_btl_ofi_component.module_count;
size_t namelen;
mca_btl_ofi_module_t *module;
size_t num_contexts_to_create;
char *linux_device_name;
char ep_name[FI_NAME_MAX];
struct fi_info *ofi_info;
struct fi_cq_attr cq_attr = {0};
struct fi_ep_attr *ep_attr;
struct fi_domain_attr *domain_attr;
struct fi_av_attr av_attr = {0};
struct fid_fabric *fabric = NULL;
struct fid_domain *domain = NULL;
struct fid_ep *endpoint = NULL;
struct fid_cq *cq = NULL;
struct fid_ep *ep = NULL;
struct fid_av *av = NULL;
mca_btl_ofi_module_t *module;
/* allocate module */
module = (mca_btl_ofi_module_t*) calloc(1, sizeof(mca_btl_ofi_module_t));
if (NULL == module) {
BTL_ERROR(("failed to allocate memory for OFI module"));
goto fail;
}
*module = mca_btl_ofi_module_template;
/* make a copy of the given info to store on the module */
ofi_info = fi_dupinfo(info);
ep_attr = ofi_info->ep_attr;
domain_attr = ofi_info->domain_attr;
linux_device_name = info->domain_attr->name;
BTL_VERBOSE(("initializing dev:%s provider:%s",
@ -330,28 +351,6 @@ static int mca_btl_ofi_init_device(struct fi_info *info)
goto fail;
}
/* endpoint */
rc = fi_endpoint(domain, ofi_info, &endpoint, NULL);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_endpoint with err=%s",
linux_device_name,
fi_strerror(-rc)
));
goto fail;
}
/* CQ */
cq_attr.format = FI_CQ_FORMAT_CONTEXT;
cq_attr.wait_obj = FI_WAIT_NONE;
rc = fi_cq_open(domain, &cq_attr, &cq, NULL);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_cq_open with err=%s",
linux_device_name,
fi_strerror(-rc)
));
goto fail;
}
/* AV */
av_attr.type = FI_AV_MAP;
rc = fi_av_open(domain, &av_attr, &av, NULL);
@ -363,29 +362,76 @@ static int mca_btl_ofi_init_device(struct fi_info *info)
goto fail;
}
num_contexts_to_create = mca_btl_ofi_component.num_contexts_per_module;
/* bind CQ and AV to endpoint */
uint32_t cq_flags = (FI_TRANSMIT);
rc = fi_ep_bind(endpoint, (fid_t)cq, cq_flags);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
linux_device_name,
fi_strerror(-rc)
));
goto fail;
/* If the domain support scalable endpoint. */
if (domain_attr->max_ep_tx_ctx > 1 && !disable_sep) {
BTL_VERBOSE(("btl/ofi using scalable endpoint."));
if (num_contexts_to_create > domain_attr->max_ep_tx_ctx) {
BTL_VERBOSE(("cannot create requested %u contexts. (node max=%zu)",
module->num_contexts,
domain_attr->max_ep_tx_ctx));
goto fail;
}
/* modify the info to let the provider know we are creating x contexts */
ep_attr->tx_ctx_cnt = num_contexts_to_create;
ep_attr->rx_ctx_cnt = num_contexts_to_create;
/* create scalable endpoint */
rc = fi_scalable_ep(domain, ofi_info, &ep, NULL);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_scalable_ep with err=%s",
linux_device_name,
fi_strerror(-rc)
));
goto fail;
}
module->num_contexts = num_contexts_to_create;
module->is_scalable_ep = true;
/* create contexts */
module->contexts = mca_btl_ofi_context_alloc_scalable(ofi_info,
domain, ep, av,
num_contexts_to_create);
} else {
/* warn the user if they want more than 1 context */
if (num_contexts_to_create > 1) {
BTL_ERROR(("cannot create %zu contexts as the provider does not support "
"scalable endpoint. Falling back to single context endpoint.",
num_contexts_to_create));
}
BTL_VERBOSE(("btl/ofi using normal endpoint."));
rc = fi_endpoint(domain, ofi_info, &ep, NULL);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_endpoint with err=%s",
linux_device_name,
fi_strerror(-rc)
));
goto fail;
}
module->num_contexts = 1;
module->is_scalable_ep = false;
/* create contexts */
module->contexts = mca_btl_ofi_context_alloc_normal(ofi_info,
domain, ep, av);
}
rc = fi_ep_bind(endpoint, (fid_t)av, 0);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
linux_device_name,
fi_strerror(-rc)
));
if (NULL == module->contexts) {
/* error message is already printed */
goto fail;
}
/* enable the endpoint for using */
rc = fi_enable(endpoint);
rc = fi_enable(ep);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_enable with err=%s",
linux_device_name,
@ -395,19 +441,12 @@ static int mca_btl_ofi_init_device(struct fi_info *info)
}
/* Everything succeeded, lets create a module for this device. */
module = (mca_btl_ofi_module_t*) calloc(1, sizeof(mca_btl_ofi_module_t));
if (NULL == module) {
goto fail;
}
*module = mca_btl_ofi_module_template;
/* store the information. */
module->fabric_info = ofi_info;
module->fabric = fabric;
module->domain = domain;
module->cq = cq;
module->av = av;
module->ofi_endpoint = endpoint;
module->ofi_endpoint = ep;
module->linux_device_name = linux_device_name;
module->outstanding_rdma = 0;
module->use_virt_addr = false;
@ -420,29 +459,13 @@ static int mca_btl_ofi_init_device(struct fi_info *info)
/* initialize the rcache */
mca_btl_ofi_rcache_init(module);
/* create endpoint list */
OBJ_CONSTRUCT(&module->endpoints, opal_list_t);
/* init free lists */
OBJ_CONSTRUCT(&module->comp_list, opal_free_list_t);
rc = opal_free_list_init(&module->comp_list,
sizeof(mca_btl_ofi_completion_t),
opal_cache_line_size,
OBJ_CLASS(mca_btl_ofi_completion_t),
0,
0,
128,
-1,
128,
NULL,
0,
NULL,
NULL,
NULL);
assert(OPAL_SUCCESS == rc);
OBJ_CONSTRUCT(&module->module_lock, opal_mutex_t);
/* create and send the modex for this device */
namelen = sizeof(ep_name);
rc = fi_getname((fid_t)endpoint, &ep_name[0], &namelen);
rc = fi_getname((fid_t)ep, &ep_name[0], &namelen);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_getname with err=%s",
linux_device_name,
@ -466,15 +489,20 @@ static int mca_btl_ofi_init_device(struct fi_info *info)
fail:
/* clean up */
/* if the contexts have not been initiated, num_contexts should
* be zero and we skip this. */
for (int i=0; i < module->num_contexts; i++) {
mca_btl_ofi_context_finalize(&module->contexts[i], module->is_scalable_ep);
}
free(module->contexts);
if (NULL != av) {
fi_close(&av->fid);
}
if (NULL != cq) {
fi_close(&cq->fid);
}
if (NULL != endpoint) {
fi_close(&endpoint->fid);
if (NULL != ep) {
fi_close(&ep->fid);
}
if (NULL != domain) {
@ -484,12 +512,12 @@ fail:
if (NULL != fabric) {
fi_close(&fabric->fid);
}
free(module);
/* not really a failure. just skip this device. */
return OPAL_ERR_OUT_OF_RESOURCE;
}
/**
* @brief OFI BTL progress function
*
@ -497,6 +525,44 @@ fail:
*/
static int mca_btl_ofi_component_progress (void)
{
    /* Progress each module's contexts.  The context owned by the calling
     * thread is progressed first; only when that yields no events do we
     * round-robin over the module's other contexts.
     *
     * mca_btl_ofi_context_trylock() returns NONZERO when the lock is
     * already held (its spin-lock companion loops "while (trylock)"), so
     * a context may only be progressed -- and subsequently unlocked --
     * when trylock returns 0.  Hence the negations below; without them we
     * would progress a context we failed to lock and then release a lock
     * owned by another thread. */
    int events = 0;
    mca_btl_ofi_context_t *context;

    for (int i = 0 ; i < mca_btl_ofi_component.module_count ; ++i) {
        mca_btl_ofi_module_t *module = mca_btl_ofi_component.modules[i];

        /* progress context we own first. */
        context = get_ofi_context(module);

        if (!mca_btl_ofi_context_trylock(context)) {
            events += mca_btl_ofi_context_progress(context);
            mca_btl_ofi_context_unlock(context);
        }

        /* if there is nothing to do, try progress other's. */
        if (events == 0) {
            for (int j = 0 ; j < module->num_contexts ; j++ ) {
                context = get_ofi_context_rr(module);

                if (!mca_btl_ofi_context_trylock(context)) {
                    events += mca_btl_ofi_context_progress(context);
                    mca_btl_ofi_context_unlock(context);
                }

                /* If we did something, good enough. return now.
                 * This is crucial for performance/latency. */
                if (events > 0) {
                    break;
                }
            }
        }
    }

    return events;
}
int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context) {
int ret = 0;
int events_read;
@ -506,72 +572,66 @@ static int mca_btl_ofi_component_progress (void)
mca_btl_ofi_completion_t *comp;
for (int i = 0 ; i < mca_btl_ofi_component.module_count ; ++i) {
mca_btl_ofi_module_t *module = mca_btl_ofi_component.modules[i];
ret = fi_cq_read(context->cq, &cq_entry, mca_btl_ofi_component.num_cqe_read);
ret = fi_cq_read(module->cq, &cq_entry, mca_btl_ofi_component.num_cqe_read);
if (0 < ret) {
events_read = ret;
for (int i = 0; i < events_read; i++) {
if (NULL != cq_entry[i].op_context) {
++events;
comp = (mca_btl_ofi_completion_t*) cq_entry[i].op_context;
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t*)comp->btl;
if (0 < ret) {
events_read = ret;
for (int j = 0; j < events_read; j++) {
if (NULL != cq_entry[j].op_context) {
++events;
comp = (mca_btl_ofi_completion_t*) cq_entry[j].op_context;
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t*)comp->btl;
switch (comp->type) {
case MCA_BTL_OFI_TYPE_GET:
case MCA_BTL_OFI_TYPE_PUT:
case MCA_BTL_OFI_TYPE_AOP:
case MCA_BTL_OFI_TYPE_AFOP:
case MCA_BTL_OFI_TYPE_CSWAP:
switch (comp->type) {
case MCA_BTL_OFI_TYPE_GET:
case MCA_BTL_OFI_TYPE_PUT:
case MCA_BTL_OFI_TYPE_AOP:
case MCA_BTL_OFI_TYPE_AFOP:
case MCA_BTL_OFI_TYPE_CSWAP:
/* call the callback */
if (comp->cbfunc) {
comp->cbfunc (comp->btl, comp->endpoint,
comp->local_address, comp->local_handle,
comp->cbcontext, comp->cbdata, OPAL_SUCCESS);
}
/* return the completion handler */
opal_free_list_return(comp->my_list, (opal_free_list_item_t*) comp);
MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl);
break;
default:
/* catasthrophic */
BTL_ERROR(("unknown completion type"));
MCA_BTL_OFI_ABORT();
/* call the callback */
if (comp->cbfunc) {
comp->cbfunc (comp->btl, comp->endpoint,
comp->local_address, comp->local_handle,
comp->cbcontext, comp->cbdata, OPAL_SUCCESS);
}
/* return the completion handler */
opal_free_list_return(comp->my_list, (opal_free_list_item_t*) comp);
MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl);
break;
default:
/* catasthrophic */
BTL_ERROR(("unknown completion type"));
MCA_BTL_OFI_ABORT();
}
}
} else if (OPAL_UNLIKELY(ret == -FI_EAVAIL)) {
ret = fi_cq_readerr(module->cq, &cqerr, 0);
/* cq readerr failed!? */
if (0 > ret) {
BTL_ERROR(("%s:%d: Error returned from fi_cq_readerr: %s(%d)",
__FILE__, __LINE__, fi_strerror(-ret), ret));
} else {
BTL_ERROR(("fi_cq_readerr: (provider err_code = %d)\n",
cqerr.prov_errno));
}
MCA_BTL_OFI_ABORT();
}
} else if (OPAL_UNLIKELY(ret == -FI_EAVAIL)) {
ret = fi_cq_readerr(context->cq, &cqerr, 0);
/* cq readerr failed!? */
if (0 > ret) {
BTL_ERROR(("%s:%d: Error returned from fi_cq_readerr: %s(%d)",
__FILE__, __LINE__, fi_strerror(-ret), ret));
} else {
BTL_ERROR(("fi_cq_readerr: (provider err_code = %d)\n",
cqerr.prov_errno));
}
MCA_BTL_OFI_ABORT();
}
#ifdef FI_EINTR
/* sometimes, sockets provider complain about interupt. */
else if (OPAL_UNLIKELY(ret == -FI_EINTR)) {
continue;
}
/* sometimes, sockets provider complain about interupt. We do nothing. */
else if (OPAL_UNLIKELY(ret == -FI_EINTR)) {
}
#endif
/* If the error is not FI_EAGAIN, report the error and abort. */
else if (OPAL_UNLIKELY(ret != -FI_EAGAIN)) {
BTL_ERROR(("fi_cq_read returned error %d:%s", ret, fi_strerror(-ret)));
MCA_BTL_OFI_ABORT();
}
/* If the error is not FI_EAGAIN, report the error and abort. */
else if (OPAL_UNLIKELY(ret != -FI_EAGAIN)) {
BTL_ERROR(("fi_cq_read returned error %d:%s", ret, fi_strerror(-ret)));
MCA_BTL_OFI_ABORT();
}
return events;
@ -593,5 +653,5 @@ mca_btl_ofi_component_t mca_btl_ofi_component = {
.btl_init = mca_btl_ofi_component_init,
.btl_progress = mca_btl_ofi_component_progress,
}
},
};

Просмотреть файл

@ -15,6 +15,10 @@
#include "btl_ofi_endpoint.h"
#include "opal/util/proc.h"
#if OPAL_HAVE_THREAD_LOCAL
opal_thread_local mca_btl_ofi_context_t *my_context = NULL;
#endif /* OPAL_HAVE_THREAD_LOCAL */
static void mca_btl_ofi_endpoint_construct (mca_btl_ofi_endpoint_t *endpoint)
{
endpoint->peer_addr = 0;
@ -49,3 +53,290 @@ mca_btl_base_endpoint_t *mca_btl_ofi_endpoint_create (opal_proc_t *proc, struct
return (mca_btl_base_endpoint_t *) endpoint;
}
/* Construct and initialize a freelist of mca_btl_ofi_completion_t objects
 * on behalf of the caller.
 *
 * @param comp_list  uninitialized freelist storage to set up
 * @return the opal_free_list_init() status (OPAL_SUCCESS on success)
 */
int ofi_comp_list_init(opal_free_list_t *comp_list)
{
    OBJ_CONSTRUCT(comp_list, opal_free_list_t);

    int status = opal_free_list_init(comp_list,
                                     sizeof(mca_btl_ofi_completion_t),
                                     opal_cache_line_size,
                                     OBJ_CLASS(mca_btl_ofi_completion_t),
                                     0, 0,
                                     128, -1, 128,
                                     NULL, 0,
                                     NULL, NULL, NULL);

    if (OPAL_SUCCESS != status) {
        BTL_VERBOSE(("cannot allocate completion freelist"));
    }

    return status;
}
/* mca_btl_ofi_context_alloc_normal()
 *
 * Allocate a single ofi_context, map its tx/rx contexts onto the regular
 * endpoint, bind CQ and AV to the endpoint and initialize the structure.
 * Returns the context, or NULL on failure (error already logged).
 * USE WITH NORMAL ENDPOINT ONLY */
mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_normal(struct fi_info *info,
                                                        struct fid_domain *domain,
                                                        struct fid_ep *ep,
                                                        struct fid_av *av)
{
    int rc;
    uint32_t cq_flags = FI_TRANSMIT;
    char *linux_device_name = info->domain_attr->name;
    struct fi_cq_attr cq_attr = {0};

    mca_btl_ofi_context_t *context;

    context = (mca_btl_ofi_context_t*) calloc(1, sizeof(*context));
    if (NULL == context) {
        BTL_VERBOSE(("cannot allocate context"));
        return NULL;
    }

    /* Don't really need to check, just avoiding compiler warning because
     * BTL_VERBOSE is a no op in performance build and the compiler will
     * complain about unused variable. */
    if (NULL == linux_device_name) {
        BTL_VERBOSE(("linux device name is NULL. This shouldn't happen."));
        goto single_fail;
    }

    /* completion queue for the endpoint */
    cq_attr.format = FI_CQ_FORMAT_CONTEXT;
    cq_attr.wait_obj = FI_WAIT_NONE;
    rc = fi_cq_open(domain, &cq_attr, &context->cq, NULL);
    if (0 != rc) {
        BTL_VERBOSE(("%s failed fi_cq_open with err=%s",
                     linux_device_name,
                     fi_strerror(-rc)
                     ));
        goto single_fail;
    }

    /* bind the AV to the endpoint */
    rc = fi_ep_bind(ep, (fid_t)av, 0);
    if (0 != rc) {
        BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
                     linux_device_name,
                     fi_strerror(-rc)
                     ));
        goto single_fail;
    }

    /* bind the CQ to the endpoint's transmit side.
     * (error text fixed: this is fi_ep_bind, not fi_scalable_ep_bind) */
    rc = fi_ep_bind(ep, (fid_t)context->cq, cq_flags);
    if (0 != rc) {
        BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
                     linux_device_name,
                     fi_strerror(-rc)
                     ));
        goto single_fail;
    }

    rc = ofi_comp_list_init(&context->comp_list);
    if (rc != OPAL_SUCCESS) {
        goto single_fail;
    }

    /* with a normal endpoint both contexts alias the endpoint itself */
    context->tx_ctx = ep;
    context->rx_ctx = ep;
    context->context_id = 0;

    return context;

single_fail:
    mca_btl_ofi_context_finalize(context, false);
    /* the context itself was leaked here before; release it too.
     * NOTE(review): finalize OBJ_DESTRUCTs comp_list even when the failure
     * happened before it was constructed -- pre-existing, confirm against
     * opal object semantics. */
    free(context);
    return NULL;
}
/* mca_btl_ofi_context_alloc_scalable()
 *
 * Allocates num_contexts communication contexts and returns a pointer to
 * the first one. It also takes care of all the bindings needed: AV to the
 * scalable endpoint, one tx/rx context pair and one CQ per context.
 * Returns NULL on failure (error already logged).
 * USE WITH SCALABLE ENDPOINT ONLY */
mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_scalable(struct fi_info *info,
                                                          struct fid_domain *domain,
                                                          struct fid_ep *sep,
                                                          struct fid_av *av,
                                                          size_t num_contexts)
{
    BTL_VERBOSE(("creating %zu contexts", num_contexts));

    int rc;
    size_t i;
    char *linux_device_name = info->domain_attr->name;

    struct fi_cq_attr cq_attr = {0};
    struct fi_tx_attr tx_attr = {0};
    struct fi_rx_attr rx_attr = {0};

    mca_btl_ofi_context_t *contexts;

    /* calloc zeroes the array, so unfilled tx_ctx/rx_ctx/cq slots stay
     * NULL and the failure path's NULL checks in finalize are safe */
    contexts = (mca_btl_ofi_context_t*) calloc(num_contexts, sizeof(*contexts));
    if (NULL == contexts) {
        BTL_VERBOSE(("cannot allocate communication contexts."));
        return NULL;
    }

    /* Don't really need to check, just avoiding compiler warning because
     * BTL_VERBOSE is a no op in performance build and the compiler will
     * complain about unused variable. */
    if (NULL == linux_device_name) {
        BTL_VERBOSE(("linux device name is NULL. This shouldn't happen."));
        goto scalable_fail;
    }

    /* bind AV to endpoint */
    rc = fi_scalable_ep_bind(sep, (fid_t)av, 0);
    if (0 != rc) {
        BTL_VERBOSE(("%s failed fi_scalable_ep_bind with err=%s",
                     linux_device_name,
                     fi_strerror(-rc)
                     ));
        goto scalable_fail;
    }

    for (i=0; i < num_contexts; i++) {
        /* transmit context i of the scalable endpoint */
        rc = fi_tx_context(sep, i, &tx_attr, &contexts[i].tx_ctx, NULL);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_tx_context with err=%s",
                         linux_device_name,
                         fi_strerror(-rc)
                         ));
            goto scalable_fail;
        }

        /* We don't actually need a receiving context as we only do one-sided.
         * However, sockets provider will hang if we dont have one. It is
         * also nice to have equal number of tx/rx context. */
        rc = fi_rx_context(sep, i, &rx_attr, &contexts[i].rx_ctx, NULL);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_rx_context with err=%s",
                         linux_device_name,
                         fi_strerror(-rc)
                         ));
            goto scalable_fail;
        }

        /* create CQ */
        cq_attr.format = FI_CQ_FORMAT_CONTEXT;
        cq_attr.wait_obj = FI_WAIT_NONE;
        rc = fi_cq_open(domain, &cq_attr, &contexts[i].cq, NULL);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_cq_open with err=%s",
                         linux_device_name,
                         fi_strerror(-rc)
                         ));
            goto scalable_fail;
        }

        /* bind cq to transmit context */
        uint32_t cq_flags = (FI_TRANSMIT);
        rc = fi_ep_bind(contexts[i].tx_ctx, (fid_t)contexts[i].cq, cq_flags);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
                         linux_device_name,
                         fi_strerror(-rc)
                         ));
            goto scalable_fail;
        }

        /* enable the context. */
        rc = fi_enable(contexts[i].tx_ctx);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_enable with err=%s",
                         linux_device_name,
                         fi_strerror(-rc)
                         ));
            goto scalable_fail;
        }

        rc = fi_enable(contexts[i].rx_ctx);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_enable with err=%s",
                         linux_device_name,
                         fi_strerror(-rc)
                         ));
            goto scalable_fail;
        }

        /* initialize completion freelist. */
        rc = ofi_comp_list_init(&contexts[i].comp_list);
        if (rc != OPAL_SUCCESS) {
            goto scalable_fail;
        }

        /* assign the id */
        contexts[i].context_id = i;
    }

    return contexts;

scalable_fail:
    /* close and free */
    /* NOTE(review): this finalizes ALL num_contexts entries, including ones
     * whose comp_list was never OBJ_CONSTRUCTed (the fid pointers are safe
     * because calloc zeroed them and finalize NULL-checks them) -- confirm
     * OBJ_DESTRUCT on zeroed storage is safe under opal's object system. */
    for(i=0; i < num_contexts; i++) {
        mca_btl_ofi_context_finalize(&contexts[i], true);
    }
    free(contexts);

    return NULL;
}
/* Tear down one communication context: close tx/rx contexts (scalable
 * endpoint only -- with a normal endpoint tx_ctx/rx_ctx alias the endpoint
 * itself and must not be closed here), close the CQ, and destruct the
 * completion freelist.
 *
 * @param context      the context to finalize (storage not freed here)
 * @param scalable_ep  true if tx_ctx/rx_ctx are per-context fids to close
 */
void mca_btl_ofi_context_finalize(mca_btl_ofi_context_t *context, bool scalable_ep) {

    /* if it is a scalable ep, we have to close all contexts. */
    if (scalable_ep) {
        if (NULL != context->tx_ctx) {
            fi_close(&context->tx_ctx->fid);
        }

        if (NULL != context->rx_ctx) {
            fi_close(&context->rx_ctx->fid);
        }
    }

    if( NULL != context->cq) {
        fi_close(&context->cq->fid);
    }

    /* Can we destruct the object that hasn't been constructed?
     * NOTE(review): error paths can reach here before ofi_comp_list_init
     * ran OBJ_CONSTRUCT on comp_list -- confirm this is safe. */
    OBJ_DESTRUCT(&context->comp_list);
}
/* Get a context to use for communication.
 * If thread-local storage is supported, each thread is assigned a context
 * on first call (round-robin under module_lock) and reuses it thereafter.
 * If not, fall back to the plain round-robin assignment. */
mca_btl_ofi_context_t *get_ofi_context(mca_btl_ofi_module_t *btl)
{
#if OPAL_HAVE_THREAD_LOCAL
    /* With TLS, we cache the context we use. */
    /* cursor for handing out the next context to a new thread; updates are
     * serialized by module_lock.  NOTE(review): the cursor is shared across
     * all modules while being reduced modulo this module's num_contexts --
     * confirm the resulting distribution is intended with >1 module. */
    static volatile int64_t cur_num = 0;

    if (OPAL_UNLIKELY(my_context == NULL)) {
        OPAL_THREAD_LOCK(&btl->module_lock);
        my_context = &btl->contexts[cur_num];
        cur_num = (cur_num + 1) %btl->num_contexts;
        OPAL_THREAD_UNLOCK(&btl->module_lock);
    }

    assert (my_context);
    return my_context;
#else
    return get_ofi_context_rr(btl);
#endif
}
/* Hand out this module's contexts in round-robin order.
 * The shared counter is deliberately NOT updated atomically: a lost
 * increment merely skews the distribution slightly, and avoiding the
 * atomic keeps this hot path cheap. */
mca_btl_ofi_context_t *get_ofi_context_rr(mca_btl_ofi_module_t *btl)
{
    static volatile uint64_t rr_num = 0;
    uint64_t slot = rr_num++ % (uint64_t) btl->num_contexts;
    return &btl->contexts[slot];
}

Просмотреть файл

@ -30,6 +30,10 @@
BEGIN_C_DECLS
#if OPAL_HAVE_THREAD_LOCAL
extern opal_thread_local mca_btl_ofi_context_t *my_context;
#endif /* OPAL_HAVE_THREAD_LOCAL */
struct mca_btl_base_endpoint_t {
opal_list_item_t super;
@ -47,7 +51,25 @@ typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
typedef mca_btl_base_endpoint_t mca_btl_ofi_endpoint_t;
OBJ_CLASS_DECLARATION(mca_btl_ofi_endpoint_t);
int ofi_comp_list_init(opal_free_list_t *comp_list);
mca_btl_base_endpoint_t *mca_btl_ofi_endpoint_create (opal_proc_t *proc, struct fid_ep *ep);
/* contexts */
mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_scalable(struct fi_info *info,
struct fid_domain *domain,
struct fid_ep *sep,
struct fid_av *av,
size_t num_contexts);
mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_normal(struct fi_info *info,
struct fid_domain *domain,
struct fid_ep *ep,
struct fid_av *av);
void mca_btl_ofi_context_finalize(mca_btl_ofi_context_t *context, bool scalable_ep);
mca_btl_ofi_context_t *get_ofi_context(mca_btl_ofi_module_t *btl);
mca_btl_ofi_context_t *get_ofi_context_rr(mca_btl_ofi_module_t *btl);
END_C_DECLS
#endif

Просмотреть файл

@ -242,14 +242,17 @@ int mca_btl_ofi_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg)
int mca_btl_ofi_finalize (mca_btl_base_module_t* btl)
{
int i;
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
mca_btl_ofi_endpoint_t *endpoint, *next;
assert(btl);
if (NULL != ofi_btl->cq) {
fi_close(&ofi_btl->cq->fid);
/* loop over all the contexts */
for (i=0; i < ofi_btl->num_contexts; i++) {
mca_btl_ofi_context_finalize(&ofi_btl->contexts[i], ofi_btl->is_scalable_ep);
}
free(ofi_btl->contexts);
if (NULL != ofi_btl->av) {
fi_close(&ofi_btl->av->fid);
@ -278,7 +281,6 @@ int mca_btl_ofi_finalize (mca_btl_base_module_t* btl)
}
OBJ_DESTRUCT(&ofi_btl->endpoints);
OBJ_DESTRUCT(&ofi_btl->comp_list);
if (ofi_btl->rcache) {
mca_rcache_base_module_destroy (ofi_btl->rcache);

Просмотреть файл

@ -21,26 +21,31 @@ OBJ_CLASS_INSTANCE(mca_btl_ofi_completion_t,
mca_btl_ofi_completion_t *mca_btl_ofi_completion_alloc (
mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
mca_btl_ofi_context_t *ofi_context,
void *local_address,
mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata,
int type)
{
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t*)btl;
assert(btl);
assert(endpoint);
assert(ofi_context);
mca_btl_ofi_completion_t *comp;
comp = (mca_btl_ofi_completion_t*) opal_free_list_get(&ofi_btl->comp_list);
comp = (mca_btl_ofi_completion_t*) opal_free_list_get(&ofi_context->comp_list);
assert(comp);
comp->btl = btl;
comp->endpoint = endpoint;
comp->my_context = ofi_context;
comp->local_address = local_address;
comp->local_handle = local_handle;
comp->cbfunc = cbfunc;
comp->cbcontext = cbcontext;
comp->cbdata = cbdata;
comp->my_list = &ofi_btl->comp_list;
comp->my_list = &ofi_context->comp_list;
comp->type = type;
return comp;
@ -53,12 +58,17 @@ int mca_btl_ofi_get (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi
{
int rc;
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
mca_btl_ofi_completion_t *comp;
mca_btl_ofi_context_t *ofi_context;
ofi_context = get_ofi_context(ofi_btl);
/* create completion context */
comp = mca_btl_ofi_completion_alloc(btl, endpoint,
ofi_context,
local_address,
local_handle,
cbfunc, cbcontext, cbdata,
@ -67,7 +77,7 @@ int mca_btl_ofi_get (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi
remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
/* Remote write data across the wire */
rc = fi_read(ofi_btl->ofi_endpoint,
rc = fi_read(ofi_context->tx_ctx,
local_address, size, /* payload */
local_handle->desc,
btl_endpoint->peer_addr,
@ -99,10 +109,14 @@ int mca_btl_ofi_put (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi
int rc;
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
mca_btl_ofi_context_t *ofi_context;
ofi_context = get_ofi_context(ofi_btl);
/* create completion context */
mca_btl_ofi_completion_t *comp;
comp = mca_btl_ofi_completion_alloc(btl, endpoint,
ofi_context,
local_address,
local_handle,
cbfunc, cbcontext, cbdata,
@ -111,7 +125,7 @@ int mca_btl_ofi_put (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi
remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
/* Remote write data across the wire */
rc = fi_write(ofi_btl->ofi_endpoint,
rc = fi_write(ofi_context->tx_ctx,
local_address, size, /* payload */
local_handle->desc,
btl_endpoint->peer_addr,

Просмотреть файл

@ -22,6 +22,7 @@
mca_btl_ofi_completion_t *mca_btl_ofi_completion_alloc (
mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
mca_btl_ofi_context_t *ofi_context,
void *local_address,
mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_rdma_completion_fn_t cbfunc,