1
1

Merge pull request #5274 from thananon/ofi_sep

btl/ofi: add scalable endpoint support.
Этот коммит содержится в:
Thananon Patinyasakdikul 2018-06-18 08:41:06 -07:00 коммит произвёл GitHub
родитель 266d5b2110 dae3c9447c
Коммит 13f58f3191
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
8 изменённых файлов: 610 добавлений и 168 удалений

Просмотреть файл

@ -46,9 +46,7 @@
#include <rdma/fi_rma.h> #include <rdma/fi_rma.h>
BEGIN_C_DECLS BEGIN_C_DECLS
#define MCA_BTL_OFI_MAX_MODULES 16
#define MCA_BTL_OFI_MAX_MODULES 16
#define MCA_BTL_OFI_MAX_WORKERS 1
#define MCA_BTL_OFI_MAX_CQ_READ_ENTRIES 128 #define MCA_BTL_OFI_MAX_CQ_READ_ENTRIES 128
#define MCA_BTL_OFI_ABORT(args) mca_btl_ofi_exit(args) #define MCA_BTL_OFI_ABORT(args) mca_btl_ofi_exit(args)
@ -62,6 +60,26 @@ enum mca_btl_ofi_type {
MCA_BTL_OFI_TYPE_TOTAL MCA_BTL_OFI_TYPE_TOTAL
}; };
/* A single OFI communication context: a tx/rx endpoint pair, its
 * completion queue, and a per-context completion freelist.  A module owns
 * an array of these (one element for a normal endpoint, several for a
 * scalable endpoint). */
struct mca_btl_ofi_context_t {
    /* index of this context within the owning module's contexts array */
    int32_t context_id;

    /* transmit context */
    struct fid_ep *tx_ctx;
    /* receive context (for a normal endpoint, tx_ctx and rx_ctx both
     * point at the endpoint itself) */
    struct fid_ep *rx_ctx;

    /* completion queue */
    struct fid_cq *cq;

    /* completion info freelist */
    /* We have it per context to reduce the thread contention
     * on the freelist. Things can get really slow. */
    opal_free_list_t comp_list;

    /* for thread locking: 0 = free, 1 = held.  See
     * mca_btl_ofi_context_trylock()/unlock(). */
    volatile int32_t lock;
};
typedef struct mca_btl_ofi_context_t mca_btl_ofi_context_t;
/** /**
* @brief OFI BTL module * @brief OFI BTL module
*/ */
@ -74,17 +92,17 @@ struct mca_btl_ofi_module_t {
struct fid_fabric *fabric; struct fid_fabric *fabric;
struct fid_domain *domain; struct fid_domain *domain;
struct fid_ep *ofi_endpoint; struct fid_ep *ofi_endpoint;
struct fid_cq *cq;
struct fid_av *av; struct fid_av *av;
int num_contexts;
mca_btl_ofi_context_t *contexts;
char *linux_device_name; char *linux_device_name;
/** whether the module has been fully initialized or not */ /** whether the module has been fully initialized or not */
bool initialized; bool initialized;
bool use_virt_addr; bool use_virt_addr;
bool is_scalable_ep;
/** spin-lock to protect the module */
volatile int32_t lock;
int64_t outstanding_rdma; int64_t outstanding_rdma;
@ -92,8 +110,7 @@ struct mca_btl_ofi_module_t {
* there is no need for a complicated structure here at this time*/ * there is no need for a complicated structure here at this time*/
opal_list_t endpoints; opal_list_t endpoints;
/* free lists */ opal_mutex_t module_lock;
opal_free_list_t comp_list;
/** registration cache */ /** registration cache */
mca_rcache_base_module_t *rcache; mca_rcache_base_module_t *rcache;
@ -110,6 +127,7 @@ struct mca_btl_ofi_component_t {
/** number of TL modules */ /** number of TL modules */
int module_count; int module_count;
int num_contexts_per_module;
int num_cqe_read; int num_cqe_read;
size_t namelen; size_t namelen;
@ -117,10 +135,6 @@ struct mca_btl_ofi_component_t {
/** All BTL OFI modules (1 per tl) */ /** All BTL OFI modules (1 per tl) */
mca_btl_ofi_module_t *modules[MCA_BTL_OFI_MAX_MODULES]; mca_btl_ofi_module_t *modules[MCA_BTL_OFI_MAX_MODULES];
#if OPAL_C_HAVE__THREAD_LOCAL
/** bind threads to contexts */
bool bind_threads_to_contexts;
#endif
}; };
typedef struct mca_btl_ofi_component_t mca_btl_ofi_component_t; typedef struct mca_btl_ofi_component_t mca_btl_ofi_component_t;
@ -151,6 +165,7 @@ struct mca_btl_ofi_completion_t {
struct mca_btl_base_module_t *btl; struct mca_btl_base_module_t *btl;
struct mca_btl_base_endpoint_t *endpoint; struct mca_btl_base_endpoint_t *endpoint;
struct mca_btl_ofi_context_t *my_context;
uint32_t type; uint32_t type;
void *local_address; void *local_address;
@ -269,7 +284,25 @@ int mca_btl_ofi_reg_mem (void *reg_data, void *base, size_t size,
mca_rcache_base_registration_t *reg); mca_rcache_base_registration_t *reg);
int mca_btl_ofi_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg); int mca_btl_ofi_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg);
int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context);
void mca_btl_ofi_exit(void); void mca_btl_ofi_exit(void);
/* thread atomics */
/* Attempt to take the per-context spin lock without blocking.
 *
 * Returns false when the lock WAS acquired and true when it was NOT
 * (already held) -- note the inverted sense compared to a typical
 * trylock; callers must check with `!` before entering the critical
 * section.  The plain read of context->lock short-circuits the atomic
 * swap when the lock is visibly held, avoiding repeated atomic traffic
 * on a contended cache line. */
static inline bool mca_btl_ofi_context_trylock (mca_btl_ofi_context_t *context)
{
    return (context->lock || OPAL_ATOMIC_SWAP_32(&context->lock, 1));
}
/* Acquire the per-context lock, spinning until it is free.
 * (trylock returns true while the lock is held by another thread,
 * so we keep retrying as long as it returns true.) */
static inline void mca_btl_ofi_context_lock(mca_btl_ofi_context_t *context)
{
    while (mca_btl_ofi_context_trylock(context));
}
/* Release the per-context lock.  The memory barrier is issued BEFORE
 * clearing the flag so every write performed inside the critical section
 * is visible to the next thread that acquires the lock. */
static inline void mca_btl_ofi_context_unlock(mca_btl_ofi_context_t *context)
{
    opal_atomic_mb();
    context->lock = 0;
}
END_C_DECLS END_C_DECLS
#endif #endif

Просмотреть файл

@ -43,6 +43,9 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl; mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint; mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
mca_btl_ofi_completion_t *comp = NULL; mca_btl_ofi_completion_t *comp = NULL;
mca_btl_ofi_context_t *ofi_context;
ofi_context = get_ofi_context(ofi_btl);
if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) { if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) {
fi_datatype = FI_UINT32; fi_datatype = FI_UINT32;
@ -51,6 +54,7 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end
fi_op = to_fi_op(op); fi_op = to_fi_op(op);
comp = mca_btl_ofi_completion_alloc(btl, endpoint, comp = mca_btl_ofi_completion_alloc(btl, endpoint,
ofi_context,
local_address, local_address,
local_handle, local_handle,
cbfunc, cbcontext, cbdata, cbfunc, cbcontext, cbdata,
@ -61,7 +65,7 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end
remote_address = (remote_address - (uint64_t) remote_handle->base_addr); remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
rc = fi_fetch_atomic(ofi_btl->ofi_endpoint, rc = fi_fetch_atomic(ofi_context->tx_ctx,
(void*) &comp->operand, 1, NULL, /* operand */ (void*) &comp->operand, 1, NULL, /* operand */
local_address, local_handle->desc, /* results */ local_address, local_handle->desc, /* results */
btl_endpoint->peer_addr, /* remote addr */ btl_endpoint->peer_addr, /* remote addr */
@ -77,6 +81,9 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl); MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
/* force a bit of progress. */
mca_btl_ofi_component.super.btl_progress();
return OPAL_SUCCESS; return OPAL_SUCCESS;
} }
@ -92,6 +99,9 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl; mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint; mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
mca_btl_ofi_completion_t *comp = NULL; mca_btl_ofi_completion_t *comp = NULL;
mca_btl_ofi_context_t *ofi_context;
ofi_context = get_ofi_context(ofi_btl);
if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) { if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) {
fi_datatype = FI_UINT32; fi_datatype = FI_UINT32;
@ -100,6 +110,7 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
fi_op = to_fi_op(op); fi_op = to_fi_op(op);
comp = mca_btl_ofi_completion_alloc(btl, endpoint, comp = mca_btl_ofi_completion_alloc(btl, endpoint,
ofi_context,
NULL, NULL,
NULL, NULL,
cbfunc, cbcontext, cbdata, cbfunc, cbcontext, cbdata,
@ -110,7 +121,7 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
remote_address = (remote_address - (uint64_t) remote_handle->base_addr); remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
rc = fi_atomic(ofi_btl->ofi_endpoint, rc = fi_atomic(ofi_context->tx_ctx,
(void*) &comp->operand, 1, NULL, /* operand */ (void*) &comp->operand, 1, NULL, /* operand */
btl_endpoint->peer_addr, /* remote addr */ btl_endpoint->peer_addr, /* remote addr */
remote_address, remote_handle->rkey, /* remote buffer */ remote_address, remote_handle->rkey, /* remote buffer */
@ -124,6 +135,7 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
} }
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl); MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
mca_btl_ofi_component.super.btl_progress();
return OPAL_SUCCESS; return OPAL_SUCCESS;
} }
@ -139,12 +151,16 @@ int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_e
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl; mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint; mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
mca_btl_ofi_completion_t *comp = NULL; mca_btl_ofi_completion_t *comp = NULL;
mca_btl_ofi_context_t *ofi_context;
ofi_context = get_ofi_context(ofi_btl);
if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) { if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) {
fi_datatype = FI_UINT32; fi_datatype = FI_UINT32;
} }
comp = mca_btl_ofi_completion_alloc(btl, endpoint, comp = mca_btl_ofi_completion_alloc(btl, endpoint,
ofi_context,
local_address, local_address,
local_handle, local_handle,
cbfunc, cbcontext, cbdata, cbfunc, cbcontext, cbdata,
@ -157,7 +173,7 @@ int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_e
remote_address = (remote_address - (uint64_t) remote_handle->base_addr); remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
/* perform atomic */ /* perform atomic */
rc = fi_compare_atomic(ofi_btl->ofi_endpoint, rc = fi_compare_atomic(ofi_context->tx_ctx,
(void*) &comp->operand, 1, NULL, (void*) &comp->operand, 1, NULL,
(void*) &comp->compare, NULL, (void*) &comp->compare, NULL,
local_address, local_handle->desc, local_address, local_handle->desc,
@ -176,5 +192,8 @@ int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_e
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl); MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
/* force a bit of progress. */
mca_btl_ofi_component.super.btl_progress();
return OPAL_SUCCESS; return OPAL_SUCCESS;
} }

Просмотреть файл

@ -31,15 +31,16 @@
#include <string.h> #include <string.h>
#include "btl_ofi.h" #include "btl_ofi.h"
#include "btl_ofi_endpoint.h"
#include "btl_ofi_rdma.h" #include "btl_ofi_rdma.h"
#define MCA_BTL_OFI_REQUIRED_CAPS (FI_RMA | FI_ATOMIC) #define MCA_BTL_OFI_REQUIRED_CAPS (FI_RMA | FI_ATOMIC)
#define MCA_BTL_OFI_REQUESTED_MR_MODE (FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_VIRT_ADDR) #define MCA_BTL_OFI_REQUESTED_MR_MODE (FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_VIRT_ADDR)
static char *prov_include; static char *prov_include;
static char *prov_exclude; static char *prov_exclude;
static char *ofi_progress_mode; static char *ofi_progress_mode;
static bool disable_sep;
static int mca_btl_ofi_init_device(struct fi_info *info); static int mca_btl_ofi_init_device(struct fi_info *info);
/* validate information returned from fi_getinfo(). /* validate information returned from fi_getinfo().
@ -124,16 +125,24 @@ static int mca_btl_ofi_component_register(void)
MCA_BASE_VAR_SCOPE_READONLY, MCA_BASE_VAR_SCOPE_READONLY,
&ofi_progress_mode); &ofi_progress_mode);
#if OPAL_C_HAVE__THREAD_LOCAL mca_btl_ofi_component.num_contexts_per_module = 1;
mca_btl_ofi_component.bind_threads_to_contexts = true;
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version, (void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
"bind_threads_to_contexts", "Bind threads to device contexts. " "num_contexts_per_module",
"In general this should improve the multi-threaded performance " "number of communication context per module to create. "
"when threads are used. (default: true)", MCA_BASE_VAR_TYPE_BOOL, "This should increase multithreaded performance but it is "
NULL, 0 ,MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, "advised that this number should be lower than total cores.",
MCA_BASE_VAR_SCOPE_ALL, MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
&mca_btl_ofi_component.bind_threads_to_contexts); OPAL_INFO_LVL_5,
#endif MCA_BASE_VAR_SCOPE_READONLY,
&mca_btl_ofi_component.num_contexts_per_module);
disable_sep = false;
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
"disable_sep",
"force btl/ofi to never use scalable endpoint. ",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&disable_sep);
/* for now we want this component to lose to btl/ugni and btl/vader */ /* for now we want this component to lose to btl/ugni and btl/vader */
module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 50; module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 50;
@ -148,7 +157,6 @@ static int mca_btl_ofi_component_open(void)
return OPAL_SUCCESS; return OPAL_SUCCESS;
} }
/* /*
* component cleanup - sanity checking of queue lengths * component cleanup - sanity checking of queue lengths
*/ */
@ -289,21 +297,34 @@ static int mca_btl_ofi_init_device(struct fi_info *info)
int rc; int rc;
int *module_count = &mca_btl_ofi_component.module_count; int *module_count = &mca_btl_ofi_component.module_count;
size_t namelen; size_t namelen;
mca_btl_ofi_module_t *module; size_t num_contexts_to_create;
char *linux_device_name; char *linux_device_name;
char ep_name[FI_NAME_MAX]; char ep_name[FI_NAME_MAX];
struct fi_info *ofi_info; struct fi_info *ofi_info;
struct fi_cq_attr cq_attr = {0}; struct fi_ep_attr *ep_attr;
struct fi_domain_attr *domain_attr;
struct fi_av_attr av_attr = {0}; struct fi_av_attr av_attr = {0};
struct fid_fabric *fabric = NULL; struct fid_fabric *fabric = NULL;
struct fid_domain *domain = NULL; struct fid_domain *domain = NULL;
struct fid_ep *endpoint = NULL; struct fid_ep *ep = NULL;
struct fid_cq *cq = NULL;
struct fid_av *av = NULL; struct fid_av *av = NULL;
mca_btl_ofi_module_t *module;
/* allocate module */
module = (mca_btl_ofi_module_t*) calloc(1, sizeof(mca_btl_ofi_module_t));
if (NULL == module) {
BTL_ERROR(("failed to allocate memory for OFI module"));
goto fail;
}
*module = mca_btl_ofi_module_template;
/* make a copy of the given info to store on the module */ /* make a copy of the given info to store on the module */
ofi_info = fi_dupinfo(info); ofi_info = fi_dupinfo(info);
ep_attr = ofi_info->ep_attr;
domain_attr = ofi_info->domain_attr;
linux_device_name = info->domain_attr->name; linux_device_name = info->domain_attr->name;
BTL_VERBOSE(("initializing dev:%s provider:%s", BTL_VERBOSE(("initializing dev:%s provider:%s",
@ -330,28 +351,6 @@ static int mca_btl_ofi_init_device(struct fi_info *info)
goto fail; goto fail;
} }
/* endpoint */
rc = fi_endpoint(domain, ofi_info, &endpoint, NULL);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_endpoint with err=%s",
linux_device_name,
fi_strerror(-rc)
));
goto fail;
}
/* CQ */
cq_attr.format = FI_CQ_FORMAT_CONTEXT;
cq_attr.wait_obj = FI_WAIT_NONE;
rc = fi_cq_open(domain, &cq_attr, &cq, NULL);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_cq_open with err=%s",
linux_device_name,
fi_strerror(-rc)
));
goto fail;
}
/* AV */ /* AV */
av_attr.type = FI_AV_MAP; av_attr.type = FI_AV_MAP;
rc = fi_av_open(domain, &av_attr, &av, NULL); rc = fi_av_open(domain, &av_attr, &av, NULL);
@ -363,29 +362,76 @@ static int mca_btl_ofi_init_device(struct fi_info *info)
goto fail; goto fail;
} }
num_contexts_to_create = mca_btl_ofi_component.num_contexts_per_module;
/* bind CQ and AV to endpoint */ /* If the domain support scalable endpoint. */
uint32_t cq_flags = (FI_TRANSMIT); if (domain_attr->max_ep_tx_ctx > 1 && !disable_sep) {
rc = fi_ep_bind(endpoint, (fid_t)cq, cq_flags);
if (0 != rc) { BTL_VERBOSE(("btl/ofi using scalable endpoint."));
BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
linux_device_name, if (num_contexts_to_create > domain_attr->max_ep_tx_ctx) {
fi_strerror(-rc) BTL_VERBOSE(("cannot create requested %u contexts. (node max=%zu)",
)); module->num_contexts,
goto fail; domain_attr->max_ep_tx_ctx));
goto fail;
}
/* modify the info to let the provider know we are creating x contexts */
ep_attr->tx_ctx_cnt = num_contexts_to_create;
ep_attr->rx_ctx_cnt = num_contexts_to_create;
/* create scalable endpoint */
rc = fi_scalable_ep(domain, ofi_info, &ep, NULL);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_scalable_ep with err=%s",
linux_device_name,
fi_strerror(-rc)
));
goto fail;
}
module->num_contexts = num_contexts_to_create;
module->is_scalable_ep = true;
/* create contexts */
module->contexts = mca_btl_ofi_context_alloc_scalable(ofi_info,
domain, ep, av,
num_contexts_to_create);
} else {
/* warn the user if they want more than 1 context */
if (num_contexts_to_create > 1) {
BTL_ERROR(("cannot create %zu contexts as the provider does not support "
"scalable endpoint. Falling back to single context endpoint.",
num_contexts_to_create));
}
BTL_VERBOSE(("btl/ofi using normal endpoint."));
rc = fi_endpoint(domain, ofi_info, &ep, NULL);
if (0 != rc) {
BTL_VERBOSE(("%s failed fi_endpoint with err=%s",
linux_device_name,
fi_strerror(-rc)
));
goto fail;
}
module->num_contexts = 1;
module->is_scalable_ep = false;
/* create contexts */
module->contexts = mca_btl_ofi_context_alloc_normal(ofi_info,
domain, ep, av);
} }
rc = fi_ep_bind(endpoint, (fid_t)av, 0); if (NULL == module->contexts) {
if (0 != rc) { /* error message is already printed */
BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
linux_device_name,
fi_strerror(-rc)
));
goto fail; goto fail;
} }
/* enable the endpoint for using */ /* enable the endpoint for using */
rc = fi_enable(endpoint); rc = fi_enable(ep);
if (0 != rc) { if (0 != rc) {
BTL_VERBOSE(("%s failed fi_enable with err=%s", BTL_VERBOSE(("%s failed fi_enable with err=%s",
linux_device_name, linux_device_name,
@ -395,19 +441,12 @@ static int mca_btl_ofi_init_device(struct fi_info *info)
} }
/* Everything succeeded, lets create a module for this device. */ /* Everything succeeded, lets create a module for this device. */
module = (mca_btl_ofi_module_t*) calloc(1, sizeof(mca_btl_ofi_module_t));
if (NULL == module) {
goto fail;
}
*module = mca_btl_ofi_module_template;
/* store the information. */ /* store the information. */
module->fabric_info = ofi_info; module->fabric_info = ofi_info;
module->fabric = fabric; module->fabric = fabric;
module->domain = domain; module->domain = domain;
module->cq = cq;
module->av = av; module->av = av;
module->ofi_endpoint = endpoint; module->ofi_endpoint = ep;
module->linux_device_name = linux_device_name; module->linux_device_name = linux_device_name;
module->outstanding_rdma = 0; module->outstanding_rdma = 0;
module->use_virt_addr = false; module->use_virt_addr = false;
@ -420,29 +459,13 @@ static int mca_btl_ofi_init_device(struct fi_info *info)
/* initialize the rcache */ /* initialize the rcache */
mca_btl_ofi_rcache_init(module); mca_btl_ofi_rcache_init(module);
/* create endpoint list */
OBJ_CONSTRUCT(&module->endpoints, opal_list_t); OBJ_CONSTRUCT(&module->endpoints, opal_list_t);
OBJ_CONSTRUCT(&module->module_lock, opal_mutex_t);
/* init free lists */
OBJ_CONSTRUCT(&module->comp_list, opal_free_list_t);
rc = opal_free_list_init(&module->comp_list,
sizeof(mca_btl_ofi_completion_t),
opal_cache_line_size,
OBJ_CLASS(mca_btl_ofi_completion_t),
0,
0,
128,
-1,
128,
NULL,
0,
NULL,
NULL,
NULL);
assert(OPAL_SUCCESS == rc);
/* create and send the modex for this device */ /* create and send the modex for this device */
namelen = sizeof(ep_name); namelen = sizeof(ep_name);
rc = fi_getname((fid_t)endpoint, &ep_name[0], &namelen); rc = fi_getname((fid_t)ep, &ep_name[0], &namelen);
if (0 != rc) { if (0 != rc) {
BTL_VERBOSE(("%s failed fi_getname with err=%s", BTL_VERBOSE(("%s failed fi_getname with err=%s",
linux_device_name, linux_device_name,
@ -466,15 +489,20 @@ static int mca_btl_ofi_init_device(struct fi_info *info)
fail: fail:
/* clean up */ /* clean up */
/* if the contexts have not been initiated, num_contexts should
* be zero and we skip this. */
for (int i=0; i < module->num_contexts; i++) {
mca_btl_ofi_context_finalize(&module->contexts[i], module->is_scalable_ep);
}
free(module->contexts);
if (NULL != av) { if (NULL != av) {
fi_close(&av->fid); fi_close(&av->fid);
} }
if (NULL != cq) {
fi_close(&cq->fid);
}
if (NULL != endpoint) { if (NULL != ep) {
fi_close(&endpoint->fid); fi_close(&ep->fid);
} }
if (NULL != domain) { if (NULL != domain) {
@ -484,12 +512,12 @@ fail:
if (NULL != fabric) { if (NULL != fabric) {
fi_close(&fabric->fid); fi_close(&fabric->fid);
} }
free(module);
/* not really a failure. just skip this device. */ /* not really a failure. just skip this device. */
return OPAL_ERR_OUT_OF_RESOURCE; return OPAL_ERR_OUT_OF_RESOURCE;
} }
/** /**
* @brief OFI BTL progress function * @brief OFI BTL progress function
* *
@ -497,6 +525,44 @@ fail:
*/ */
static int mca_btl_ofi_component_progress (void) static int mca_btl_ofi_component_progress (void)
{ {
int events = 0;
mca_btl_ofi_context_t *context;
for (int i = 0 ; i < mca_btl_ofi_component.module_count ; ++i) {
mca_btl_ofi_module_t *module = mca_btl_ofi_component.modules[i];
/* progress context we own first. */
context = get_ofi_context(module);
if (mca_btl_ofi_context_trylock(context)) {
events += mca_btl_ofi_context_progress(context);
mca_btl_ofi_context_unlock(context);
}
/* if there is nothing to do, try progress other's. */
if (events == 0) {
for (int j = 0 ; j < module->num_contexts ; j++ ) {
context = get_ofi_context_rr(module);
if (mca_btl_ofi_context_trylock(context)) {
events += mca_btl_ofi_context_progress(context);
mca_btl_ofi_context_unlock(context);
}
/* If we did something, good enough. return now.
* This is crucial for performance/latency. */
if (events > 0) {
break;
}
}
}
}
return events;
}
int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context) {
int ret = 0; int ret = 0;
int events_read; int events_read;
@ -506,72 +572,66 @@ static int mca_btl_ofi_component_progress (void)
mca_btl_ofi_completion_t *comp; mca_btl_ofi_completion_t *comp;
for (int i = 0 ; i < mca_btl_ofi_component.module_count ; ++i) { ret = fi_cq_read(context->cq, &cq_entry, mca_btl_ofi_component.num_cqe_read);
mca_btl_ofi_module_t *module = mca_btl_ofi_component.modules[i];
ret = fi_cq_read(module->cq, &cq_entry, mca_btl_ofi_component.num_cqe_read); if (0 < ret) {
events_read = ret;
for (int i = 0; i < events_read; i++) {
if (NULL != cq_entry[i].op_context) {
++events;
comp = (mca_btl_ofi_completion_t*) cq_entry[i].op_context;
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t*)comp->btl;
if (0 < ret) { switch (comp->type) {
events_read = ret; case MCA_BTL_OFI_TYPE_GET:
for (int j = 0; j < events_read; j++) { case MCA_BTL_OFI_TYPE_PUT:
if (NULL != cq_entry[j].op_context) { case MCA_BTL_OFI_TYPE_AOP:
++events; case MCA_BTL_OFI_TYPE_AFOP:
comp = (mca_btl_ofi_completion_t*) cq_entry[j].op_context; case MCA_BTL_OFI_TYPE_CSWAP:
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t*)comp->btl;
switch (comp->type) { /* call the callback */
case MCA_BTL_OFI_TYPE_GET: if (comp->cbfunc) {
case MCA_BTL_OFI_TYPE_PUT: comp->cbfunc (comp->btl, comp->endpoint,
case MCA_BTL_OFI_TYPE_AOP: comp->local_address, comp->local_handle,
case MCA_BTL_OFI_TYPE_AFOP: comp->cbcontext, comp->cbdata, OPAL_SUCCESS);
case MCA_BTL_OFI_TYPE_CSWAP:
/* call the callback */
if (comp->cbfunc) {
comp->cbfunc (comp->btl, comp->endpoint,
comp->local_address, comp->local_handle,
comp->cbcontext, comp->cbdata, OPAL_SUCCESS);
}
/* return the completion handler */
opal_free_list_return(comp->my_list, (opal_free_list_item_t*) comp);
MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl);
break;
default:
/* catasthrophic */
BTL_ERROR(("unknown completion type"));
MCA_BTL_OFI_ABORT();
} }
/* return the completion handler */
opal_free_list_return(comp->my_list, (opal_free_list_item_t*) comp);
MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl);
break;
default:
/* catasthrophic */
BTL_ERROR(("unknown completion type"));
MCA_BTL_OFI_ABORT();
} }
} }
} else if (OPAL_UNLIKELY(ret == -FI_EAVAIL)) {
ret = fi_cq_readerr(module->cq, &cqerr, 0);
/* cq readerr failed!? */
if (0 > ret) {
BTL_ERROR(("%s:%d: Error returned from fi_cq_readerr: %s(%d)",
__FILE__, __LINE__, fi_strerror(-ret), ret));
} else {
BTL_ERROR(("fi_cq_readerr: (provider err_code = %d)\n",
cqerr.prov_errno));
}
MCA_BTL_OFI_ABORT();
} }
} else if (OPAL_UNLIKELY(ret == -FI_EAVAIL)) {
ret = fi_cq_readerr(context->cq, &cqerr, 0);
/* cq readerr failed!? */
if (0 > ret) {
BTL_ERROR(("%s:%d: Error returned from fi_cq_readerr: %s(%d)",
__FILE__, __LINE__, fi_strerror(-ret), ret));
} else {
BTL_ERROR(("fi_cq_readerr: (provider err_code = %d)\n",
cqerr.prov_errno));
}
MCA_BTL_OFI_ABORT();
}
#ifdef FI_EINTR #ifdef FI_EINTR
/* sometimes, sockets provider complain about interupt. */ /* sometimes, sockets provider complain about interupt. We do nothing. */
else if (OPAL_UNLIKELY(ret == -FI_EINTR)) { else if (OPAL_UNLIKELY(ret == -FI_EINTR)) {
continue;
} }
#endif #endif
/* If the error is not FI_EAGAIN, report the error and abort. */ /* If the error is not FI_EAGAIN, report the error and abort. */
else if (OPAL_UNLIKELY(ret != -FI_EAGAIN)) { else if (OPAL_UNLIKELY(ret != -FI_EAGAIN)) {
BTL_ERROR(("fi_cq_read returned error %d:%s", ret, fi_strerror(-ret))); BTL_ERROR(("fi_cq_read returned error %d:%s", ret, fi_strerror(-ret)));
MCA_BTL_OFI_ABORT(); MCA_BTL_OFI_ABORT();
}
} }
return events; return events;
@ -593,5 +653,5 @@ mca_btl_ofi_component_t mca_btl_ofi_component = {
.btl_init = mca_btl_ofi_component_init, .btl_init = mca_btl_ofi_component_init,
.btl_progress = mca_btl_ofi_component_progress, .btl_progress = mca_btl_ofi_component_progress,
} },
}; };

Просмотреть файл

@ -15,6 +15,10 @@
#include "btl_ofi_endpoint.h" #include "btl_ofi_endpoint.h"
#include "opal/util/proc.h" #include "opal/util/proc.h"
#if OPAL_HAVE_THREAD_LOCAL
opal_thread_local mca_btl_ofi_context_t *my_context = NULL;
#endif /* OPAL_HAVE_THREAD_LOCAL */
static void mca_btl_ofi_endpoint_construct (mca_btl_ofi_endpoint_t *endpoint) static void mca_btl_ofi_endpoint_construct (mca_btl_ofi_endpoint_t *endpoint)
{ {
endpoint->peer_addr = 0; endpoint->peer_addr = 0;
@ -49,3 +53,290 @@ mca_btl_base_endpoint_t *mca_btl_ofi_endpoint_create (opal_proc_t *proc, struct
return (mca_btl_base_endpoint_t *) endpoint; return (mca_btl_base_endpoint_t *) endpoint;
} }
/* Construct and initialize a free list of mca_btl_ofi_completion_t
 * objects.
 *
 * @param comp_list  (OUT) free list to construct and initialize
 * @return OPAL_SUCCESS on success, an OPAL error code otherwise
 */
int ofi_comp_list_init(opal_free_list_t *comp_list)
{
    OBJ_CONSTRUCT(comp_list, opal_free_list_t);

    int ret = opal_free_list_init(comp_list,
                                  sizeof(mca_btl_ofi_completion_t),
                                  opal_cache_line_size,
                                  OBJ_CLASS(mca_btl_ofi_completion_t),
                                  0,        /* payload size */
                                  0,        /* payload alignment */
                                  128,      /* elements to allocate up front */
                                  -1,       /* no upper bound */
                                  128,      /* growth increment */
                                  NULL, 0, NULL, NULL, NULL);

    if (OPAL_SUCCESS != ret) {
        BTL_VERBOSE(("cannot allocate completion freelist"));
    }

    return ret;
}
/* mca_btl_ofi_context_alloc_normal()
 *
 * Allocate a single ofi_context for a NORMAL (non-scalable) endpoint:
 * open a CQ, bind AV and CQ to the endpoint, initialize the completion
 * freelist, and point tx_ctx/rx_ctx at the endpoint itself.
 *
 * Returns the new context, or NULL on failure (all partially-created
 * resources are released).  USE WITH NORMAL ENDPOINT ONLY. */
mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_normal(struct fi_info *info,
                                                        struct fid_domain *domain,
                                                        struct fid_ep *ep,
                                                        struct fid_av *av)
{
    int rc;
    uint32_t cq_flags = FI_TRANSMIT;
    char *linux_device_name = info->domain_attr->name;
    struct fi_cq_attr cq_attr = {0};

    mca_btl_ofi_context_t *context;

    context = (mca_btl_ofi_context_t*) calloc(1, sizeof(*context));
    if (NULL == context) {
        BTL_VERBOSE(("cannot allocate context"));
        return NULL;
    }

    /* Don't really need to check, just avoiding compiler warning because
     * BTL_VERBOSE is a no op in performance build and the compiler will
     * complain about unused variable. */
    if (NULL == linux_device_name) {
        BTL_VERBOSE(("linux device name is NULL. This shouldn't happen."));
        goto single_fail;
    }

    cq_attr.format = FI_CQ_FORMAT_CONTEXT;
    cq_attr.wait_obj = FI_WAIT_NONE;
    rc = fi_cq_open(domain, &cq_attr, &context->cq, NULL);
    if (0 != rc) {
        BTL_VERBOSE(("%s failed fi_cq_open with err=%s",
                        linux_device_name,
                        fi_strerror(-rc)
                        ));
        goto single_fail;
    }

    rc = fi_ep_bind(ep, (fid_t)av, 0);
    if (0 != rc) {
        BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
                        linux_device_name,
                        fi_strerror(-rc)
                        ));
        goto single_fail;
    }

    /* NOTE: error message previously said "fi_scalable_ep_bind" here,
     * but this is a plain fi_ep_bind of the CQ. */
    rc = fi_ep_bind(ep, (fid_t)context->cq, cq_flags);
    if (0 != rc) {
        BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
                        linux_device_name,
                        fi_strerror(-rc)
                        ));
        goto single_fail;
    }

    rc = ofi_comp_list_init(&context->comp_list);
    if (rc != OPAL_SUCCESS) {
        goto single_fail;
    }

    /* with a normal endpoint the endpoint doubles as both contexts */
    context->tx_ctx = ep;
    context->rx_ctx = ep;
    context->context_id = 0;

    return context;

single_fail:
    /* Clean up inline rather than calling mca_btl_ofi_context_finalize():
     * on this path comp_list may not have been constructed yet, and
     * OBJ_DESTRUCT of an unconstructed object is not safe.  Also free the
     * context itself, which would otherwise leak. */
    if (NULL != context->cq) {
        fi_close(&context->cq->fid);
    }
    free(context);
    return NULL;
}
/* mca_btl_ofi_context_alloc_scalable()
 *
 * Allocate num_contexts communication contexts on a SCALABLE endpoint:
 * bind the AV to the endpoint, then for each context create a tx and rx
 * context, a CQ, bind and enable them, and initialize the completion
 * freelist.  Returns a pointer to the array of contexts, or NULL on
 * failure (everything created so far is torn down).
 * USE WITH SCALABLE ENDPOINT ONLY. */
mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_scalable(struct fi_info *info,
                                                          struct fid_domain *domain,
                                                          struct fid_ep *sep,
                                                          struct fid_av *av,
                                                          size_t num_contexts)
{
    BTL_VERBOSE(("creating %zu contexts", num_contexts));

    int rc;
    size_t i;
    size_t completed = 0;   /* number of FULLY initialized contexts */
    char *linux_device_name = info->domain_attr->name;

    struct fi_cq_attr cq_attr = {0};
    struct fi_tx_attr tx_attr = {0};
    struct fi_rx_attr rx_attr = {0};

    mca_btl_ofi_context_t *contexts;

    contexts = (mca_btl_ofi_context_t*) calloc(num_contexts, sizeof(*contexts));
    if (NULL == contexts) {
        BTL_VERBOSE(("cannot allocate communication contexts."));
        return NULL;
    }

    /* Don't really need to check, just avoiding compiler warning because
     * BTL_VERBOSE is a no op in performance build and the compiler will
     * complain about unused variable. */
    if (NULL == linux_device_name) {
        BTL_VERBOSE(("linux device name is NULL. This shouldn't happen."));
        goto scalable_fail;
    }

    /* bind AV to endpoint */
    rc = fi_scalable_ep_bind(sep, (fid_t)av, 0);
    if (0 != rc) {
        BTL_VERBOSE(("%s failed fi_scalable_ep_bind with err=%s",
                        linux_device_name,
                        fi_strerror(-rc)
                        ));
        goto scalable_fail;
    }

    for (i = 0; i < num_contexts; i++) {
        rc = fi_tx_context(sep, i, &tx_attr, &contexts[i].tx_ctx, NULL);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_tx_context with err=%s",
                            linux_device_name,
                            fi_strerror(-rc)
                            ));
            goto scalable_fail;
        }

        /* We don't actually need a receiving context as we only do one-sided.
         * However, sockets provider will hang if we dont have one. It is
         * also nice to have equal number of tx/rx context. */
        rc = fi_rx_context(sep, i, &rx_attr, &contexts[i].rx_ctx, NULL);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_rx_context with err=%s",
                            linux_device_name,
                            fi_strerror(-rc)
                            ));
            goto scalable_fail;
        }

        /* create CQ */
        cq_attr.format = FI_CQ_FORMAT_CONTEXT;
        cq_attr.wait_obj = FI_WAIT_NONE;
        rc = fi_cq_open(domain, &cq_attr, &contexts[i].cq, NULL);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_cq_open with err=%s",
                            linux_device_name,
                            fi_strerror(-rc)
                            ));
            goto scalable_fail;
        }

        /* bind cq to transmit context */
        uint32_t cq_flags = (FI_TRANSMIT);
        rc = fi_ep_bind(contexts[i].tx_ctx, (fid_t)contexts[i].cq, cq_flags);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
                            linux_device_name,
                            fi_strerror(-rc)
                            ));
            goto scalable_fail;
        }

        /* enable the context. */
        rc = fi_enable(contexts[i].tx_ctx);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_enable with err=%s",
                            linux_device_name,
                            fi_strerror(-rc)
                            ));
            goto scalable_fail;
        }

        rc = fi_enable(contexts[i].rx_ctx);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_enable with err=%s",
                            linux_device_name,
                            fi_strerror(-rc)
                            ));
            goto scalable_fail;
        }

        /* initialize completion freelist. */
        rc = ofi_comp_list_init(&contexts[i].comp_list);
        if (rc != OPAL_SUCCESS) {
            goto scalable_fail;
        }

        /* assign the id */
        contexts[i].context_id = (int32_t) i;
        completed = i + 1;
    }

    return contexts;

scalable_fail:
    /* Tear down only what was actually created.  Thanks to calloc the fid
     * pointers of untouched contexts are NULL so closing them is safely
     * skipped, but OBJ_DESTRUCT of a never-constructed comp_list is NOT
     * safe -- only fully initialized contexts (index < completed) go
     * through mca_btl_ofi_context_finalize(). */
    for (i = 0; i < num_contexts; i++) {
        if (i < completed) {
            mca_btl_ofi_context_finalize(&contexts[i], true);
        } else {
            if (NULL != contexts[i].tx_ctx) {
                fi_close(&contexts[i].tx_ctx->fid);
            }
            if (NULL != contexts[i].rx_ctx) {
                fi_close(&contexts[i].rx_ctx->fid);
            }
            if (NULL != contexts[i].cq) {
                fi_close(&contexts[i].cq->fid);
            }
        }
    }
    free(contexts);

    return NULL;
}
/* Release all resources held by a communication context.
 *
 * For a scalable endpoint the tx/rx contexts are separate fids and must
 * be closed here; for a normal endpoint tx_ctx/rx_ctx alias the endpoint
 * itself, which the module owns and closes elsewhere, so they are left
 * alone.  The context structure itself is NOT freed by this function. */
void mca_btl_ofi_context_finalize(mca_btl_ofi_context_t *context, bool scalable_ep) {

    /* if it is a scalable ep, we have to close all contexts. */
    if (scalable_ep) {
        if (NULL != context->tx_ctx) {
            fi_close(&context->tx_ctx->fid);
        }

        if (NULL != context->rx_ctx) {
            fi_close(&context->rx_ctx->fid);
        }
    }

    if( NULL != context->cq) {
        fi_close(&context->cq->fid);
    }

    /* NOTE(review): this assumes comp_list has been OBJ_CONSTRUCTed
     * (ofi_comp_list_init ran).  Calling this on a context whose freelist
     * was never constructed is not safe -- callers on error paths must
     * only pass fully initialized contexts. */
    OBJ_DESTRUCT(&context->comp_list);
}
/* Select the communication context the calling thread should use.
 *
 * When thread-local storage is available, the first call pins a context
 * to this thread (handed out round-robin under the module lock) and all
 * later calls return the cached pointer.  Without TLS we fall back to
 * plain round-robin selection.
 *
 * NOTE(review): the cached pointer is a single TLS variable shared by
 * every btl module, so with more than one OFI module a thread keeps the
 * context of whichever module it queried first -- TODO confirm this is
 * the intended behavior. */
mca_btl_ofi_context_t *get_ofi_context(mca_btl_ofi_module_t *btl)
{
#if OPAL_HAVE_THREAD_LOCAL
    /* With TLS, we cache the context we use. */
    static volatile int64_t next_slot = 0;

    if (OPAL_UNLIKELY(NULL == my_context)) {
        OPAL_THREAD_LOCK(&btl->module_lock);
        my_context = &btl->contexts[next_slot];
        next_slot = (next_slot + 1) % btl->num_contexts;
        OPAL_THREAD_UNLOCK(&btl->module_lock);
    }

    assert (NULL != my_context);
    return my_context;
#else
    return get_ofi_context_rr(btl);
#endif
}
/* Hand out contexts in round-robin order.
 *
 * The counter is deliberately NOT atomic: a torn or duplicated increment
 * under concurrency only skews the distribution slightly, which is cheaper
 * than paying for atomics on this hot path. */
mca_btl_ofi_context_t *get_ofi_context_rr(mca_btl_ofi_module_t *btl)
{
    static volatile uint64_t next_ctx = 0;
    uint64_t idx = next_ctx++ % (uint64_t) btl->num_contexts;
    return &btl->contexts[idx];
}

Просмотреть файл

@ -30,6 +30,10 @@
BEGIN_C_DECLS BEGIN_C_DECLS
#if OPAL_HAVE_THREAD_LOCAL
extern opal_thread_local mca_btl_ofi_context_t *my_context;
#endif /* OPAL_HAVE_THREAD_LOCAL */
struct mca_btl_base_endpoint_t { struct mca_btl_base_endpoint_t {
opal_list_item_t super; opal_list_item_t super;
@ -47,7 +51,25 @@ typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
typedef mca_btl_base_endpoint_t mca_btl_ofi_endpoint_t; typedef mca_btl_base_endpoint_t mca_btl_ofi_endpoint_t;
OBJ_CLASS_DECLARATION(mca_btl_ofi_endpoint_t); OBJ_CLASS_DECLARATION(mca_btl_ofi_endpoint_t);
int ofi_comp_list_init(opal_free_list_t *comp_list);
mca_btl_base_endpoint_t *mca_btl_ofi_endpoint_create (opal_proc_t *proc, struct fid_ep *ep); mca_btl_base_endpoint_t *mca_btl_ofi_endpoint_create (opal_proc_t *proc, struct fid_ep *ep);
/* contexts */
mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_scalable(struct fi_info *info,
struct fid_domain *domain,
struct fid_ep *sep,
struct fid_av *av,
size_t num_contexts);
mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_normal(struct fi_info *info,
struct fid_domain *domain,
struct fid_ep *ep,
struct fid_av *av);
void mca_btl_ofi_context_finalize(mca_btl_ofi_context_t *context, bool scalable_ep);
mca_btl_ofi_context_t *get_ofi_context(mca_btl_ofi_module_t *btl);
mca_btl_ofi_context_t *get_ofi_context_rr(mca_btl_ofi_module_t *btl);
END_C_DECLS END_C_DECLS
#endif #endif

Просмотреть файл

@ -242,14 +242,17 @@ int mca_btl_ofi_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg)
int mca_btl_ofi_finalize (mca_btl_base_module_t* btl) int mca_btl_ofi_finalize (mca_btl_base_module_t* btl)
{ {
int i;
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl; mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
mca_btl_ofi_endpoint_t *endpoint, *next; mca_btl_ofi_endpoint_t *endpoint, *next;
assert(btl); assert(btl);
if (NULL != ofi_btl->cq) { /* loop over all the contexts */
fi_close(&ofi_btl->cq->fid); for (i=0; i < ofi_btl->num_contexts; i++) {
mca_btl_ofi_context_finalize(&ofi_btl->contexts[i], ofi_btl->is_scalable_ep);
} }
free(ofi_btl->contexts);
if (NULL != ofi_btl->av) { if (NULL != ofi_btl->av) {
fi_close(&ofi_btl->av->fid); fi_close(&ofi_btl->av->fid);
@ -278,7 +281,6 @@ int mca_btl_ofi_finalize (mca_btl_base_module_t* btl)
} }
OBJ_DESTRUCT(&ofi_btl->endpoints); OBJ_DESTRUCT(&ofi_btl->endpoints);
OBJ_DESTRUCT(&ofi_btl->comp_list);
if (ofi_btl->rcache) { if (ofi_btl->rcache) {
mca_rcache_base_module_destroy (ofi_btl->rcache); mca_rcache_base_module_destroy (ofi_btl->rcache);

Просмотреть файл

@ -21,26 +21,31 @@ OBJ_CLASS_INSTANCE(mca_btl_ofi_completion_t,
mca_btl_ofi_completion_t *mca_btl_ofi_completion_alloc ( mca_btl_ofi_completion_t *mca_btl_ofi_completion_alloc (
mca_btl_base_module_t *btl, mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint, mca_btl_base_endpoint_t *endpoint,
mca_btl_ofi_context_t *ofi_context,
void *local_address, void *local_address,
mca_btl_base_registration_handle_t *local_handle, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_rdma_completion_fn_t cbfunc, mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata, void *cbcontext, void *cbdata,
int type) int type)
{ {
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t*)btl; assert(btl);
assert(endpoint);
assert(ofi_context);
mca_btl_ofi_completion_t *comp; mca_btl_ofi_completion_t *comp;
comp = (mca_btl_ofi_completion_t*) opal_free_list_get(&ofi_btl->comp_list); comp = (mca_btl_ofi_completion_t*) opal_free_list_get(&ofi_context->comp_list);
assert(comp); assert(comp);
comp->btl = btl; comp->btl = btl;
comp->endpoint = endpoint; comp->endpoint = endpoint;
comp->my_context = ofi_context;
comp->local_address = local_address; comp->local_address = local_address;
comp->local_handle = local_handle; comp->local_handle = local_handle;
comp->cbfunc = cbfunc; comp->cbfunc = cbfunc;
comp->cbcontext = cbcontext; comp->cbcontext = cbcontext;
comp->cbdata = cbdata; comp->cbdata = cbdata;
comp->my_list = &ofi_btl->comp_list; comp->my_list = &ofi_context->comp_list;
comp->type = type; comp->type = type;
return comp; return comp;
@ -53,12 +58,17 @@ int mca_btl_ofi_get (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi
{ {
int rc; int rc;
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl; mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint; mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
mca_btl_ofi_completion_t *comp; mca_btl_ofi_completion_t *comp;
mca_btl_ofi_context_t *ofi_context;
ofi_context = get_ofi_context(ofi_btl);
/* create completion context */ /* create completion context */
comp = mca_btl_ofi_completion_alloc(btl, endpoint, comp = mca_btl_ofi_completion_alloc(btl, endpoint,
ofi_context,
local_address, local_address,
local_handle, local_handle,
cbfunc, cbcontext, cbdata, cbfunc, cbcontext, cbdata,
@ -67,7 +77,7 @@ int mca_btl_ofi_get (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi
remote_address = (remote_address - (uint64_t) remote_handle->base_addr); remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
/* Remote write data across the wire */ /* Remote write data across the wire */
rc = fi_read(ofi_btl->ofi_endpoint, rc = fi_read(ofi_context->tx_ctx,
local_address, size, /* payload */ local_address, size, /* payload */
local_handle->desc, local_handle->desc,
btl_endpoint->peer_addr, btl_endpoint->peer_addr,
@ -99,10 +109,14 @@ int mca_btl_ofi_put (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi
int rc; int rc;
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl; mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint; mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
mca_btl_ofi_context_t *ofi_context;
ofi_context = get_ofi_context(ofi_btl);
/* create completion context */ /* create completion context */
mca_btl_ofi_completion_t *comp; mca_btl_ofi_completion_t *comp;
comp = mca_btl_ofi_completion_alloc(btl, endpoint, comp = mca_btl_ofi_completion_alloc(btl, endpoint,
ofi_context,
local_address, local_address,
local_handle, local_handle,
cbfunc, cbcontext, cbdata, cbfunc, cbcontext, cbdata,
@ -111,7 +125,7 @@ int mca_btl_ofi_put (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi
remote_address = (remote_address - (uint64_t) remote_handle->base_addr); remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
/* Remote write data across the wire */ /* Remote write data across the wire */
rc = fi_write(ofi_btl->ofi_endpoint, rc = fi_write(ofi_context->tx_ctx,
local_address, size, /* payload */ local_address, size, /* payload */
local_handle->desc, local_handle->desc,
btl_endpoint->peer_addr, btl_endpoint->peer_addr,

Просмотреть файл

@ -22,6 +22,7 @@
mca_btl_ofi_completion_t *mca_btl_ofi_completion_alloc ( mca_btl_ofi_completion_t *mca_btl_ofi_completion_alloc (
mca_btl_base_module_t *btl, mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint, mca_btl_base_endpoint_t *endpoint,
mca_btl_ofi_context_t *ofi_context,
void *local_address, void *local_address,
mca_btl_base_registration_handle_t *local_handle, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_rdma_completion_fn_t cbfunc, mca_btl_base_rdma_completion_fn_t cbfunc,