1
1

btl/ofi: Added 2 side communication support.

Two-sided communication support is added so that non-tag-matching providers
can take advantage of this BTL with the OB1 PML. The current state is
"functional" and not optimized for performance.

Two sided support is disabled by default and can be turned on by mca
parameter: "mca_btl_ofi_mode".

Signed-off-by: Thananon Patinyasakdikul <thananon.patinyasakdikul@intel.com>
(cherry picked from commit 080115d44069e0c461a1af105cd41f28849cdffc)
Signed-off-by: Brian Barrett <bbarrett@amazon.com>
Этот коммит содержится в:
Thananon Patinyasakdikul 2018-08-03 12:30:03 -07:00 коммит произвёл Brian Barrett
родитель 02ac75434a
Коммит f9439c6d18
13 изменённых файлов: 1157 добавлений и 500 удалений

Просмотреть файл

@ -31,7 +31,10 @@ sources = \
btl_ofi_module.c \
btl_ofi_rdma.h \
btl_ofi_rdma.c \
btl_ofi_atomics.c
btl_ofi_atomics.c \
btl_ofi_frag.c \
btl_ofi_frag.h \
btl_ofi_context.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la

Просмотреть файл

@ -86,3 +86,25 @@ Known Problems:
- sockets provider uses progress thread and can cause segfault in finalize as we free
the resources while progress thread is still using it. sleep(1) was put in
mca_btl_ofi_componenet_close() for this reason.
- sockets provider deadlock in two-sided mode. Might be something about buffered recv.
(August 2018).
========================================
Scalable Endpoint
This BTL will try to use a scalable endpoint to create communication contexts. This will increase
multithreaded performance for some applications. The default number of contexts created is 1 and
can be tuned via the MCA parameter "btl_ofi_num_contexts_per_module". It is advised that the number
of contexts be equal to the number of physical cores for optimal performance.
The user can disable scalable endpoints with the MCA parameter "btl_ofi_disable_sep".
With scalable endpoints disabled, the BTL will alias the OFI endpoint to both the tx and rx contexts.
========================================
Two sided communication
Two-sided communication was added later to BTL OFI to enable non-tag-matching providers
to be used in Open MPI with this BTL. However, the support is only "functional"
and has not been optimized for performance at this point. (August 2018)

Просмотреть файл

@ -38,6 +38,8 @@
#include "opal/mca/rcache/base/base.h"
#include "opal/mca/pmix/pmix.h"
#include "opal/class/opal_hash_table.h"
#include <rdma/fabric.h>
#include <rdma/fi_domain.h>
#include <rdma/fi_errno.h>
@ -47,18 +49,31 @@
BEGIN_C_DECLS
#define MCA_BTL_OFI_MAX_MODULES 16
#define MCA_BTL_OFI_MAX_CQ_READ_ENTRIES 128
#define MCA_BTL_OFI_NUM_CQE_READ 64
#define MCA_BTL_OFI_PROGRESS_THRESHOLD 64
#define MCA_BTL_OFI_DEFAULT_RD_NUM 10
#define MCA_BTL_OFI_DEFAULT_MAX_CQE 128
#define MCA_BTL_OFI_DEFAULT_PROGRESS_THRESHOLD 64
#define MCA_BTL_OFI_ABORT(args) mca_btl_ofi_exit(args)
enum mca_btl_ofi_type {
MCA_BTL_OFI_TYPE_PUT = 1,
#define TWO_SIDED_ENABLED mca_btl_ofi_component.two_sided_enabled
enum mca_btl_ofi_mode {
MCA_BTL_OFI_MODE_ONE_SIDED = 0,
MCA_BTL_OFI_MODE_TWO_SIDED,
MCA_BTL_OFI_MODE_FULL_SUPPORT,
MCA_BTL_OFI_MODE_TOTAL
};
enum mca_btl_ofi_hdr_type {
MCA_BTL_OFI_TYPE_PUT = 0,
MCA_BTL_OFI_TYPE_GET,
MCA_BTL_OFI_TYPE_AOP,
MCA_BTL_OFI_TYPE_AFOP,
MCA_BTL_OFI_TYPE_CSWAP,
MCA_BTL_OFI_TYPE_SEND,
MCA_BTL_OFI_TYPE_RECV,
MCA_BTL_OFI_TYPE_TOTAL
};
@ -75,7 +90,9 @@ struct mca_btl_ofi_context_t {
/* completion info freelist */
/* We have it per context to reduce the thread contention
* on the freelist. Things can get really slow. */
opal_free_list_t comp_list;
opal_free_list_t rdma_comp_list;
opal_free_list_t frag_comp_list;
opal_free_list_t frag_list;
/* for thread locking */
volatile int32_t lock;
@ -107,12 +124,14 @@ struct mca_btl_ofi_module_t {
bool is_scalable_ep;
int64_t outstanding_rdma;
int64_t outstanding_send;
/** linked list of BTL endpoints. this list is never searched so
* there is no need for a complicated structure here at this time*/
opal_list_t endpoints;
opal_mutex_t module_lock;
opal_hash_table_t id_to_endpoint;
/** registration cache */
mca_rcache_base_module_t *rcache;
@ -132,6 +151,9 @@ struct mca_btl_ofi_component_t {
int num_contexts_per_module;
int num_cqe_read;
int progress_threshold;
int mode;
int rd_num;
bool two_sided_enabled;
size_t namelen;
@ -160,32 +182,73 @@ typedef struct mca_btl_ofi_reg_t mca_btl_ofi_reg_t;
OBJ_CLASS_DECLARATION(mca_btl_ofi_reg_t);
struct mca_btl_ofi_header_t {
mca_btl_base_tag_t tag;
size_t len;
};
typedef struct mca_btl_ofi_header_t mca_btl_ofi_header_t;
struct mca_btl_ofi_base_frag_t {
mca_btl_base_descriptor_t base;
mca_btl_base_segment_t segments[2];
int context_id;
struct mca_btl_ofi_module_t *btl;
struct mca_btl_base_endpoint_t *endpoint;
opal_free_list_t *free_list;
mca_btl_ofi_header_t hdr;
};
typedef struct mca_btl_ofi_base_frag_t mca_btl_ofi_base_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_ofi_base_frag_t);
struct mca_btl_ofi_completion_context_t {
struct fi_context ctx;
void *comp;
};
typedef struct mca_btl_ofi_completion_context_t mca_btl_ofi_completion_context_t;
/* completion structure store information needed
* for RDMA callbacks */
struct mca_btl_ofi_completion_t {
struct mca_btl_ofi_base_completion_t {
opal_free_list_item_t comp_list;
opal_free_list_t *my_list;
struct mca_btl_base_module_t *btl;
struct mca_btl_base_endpoint_t *endpoint;
struct mca_btl_ofi_context_t *my_context;
uint32_t type;
int type;
};
typedef struct mca_btl_ofi_base_completion_t mca_btl_ofi_base_completion_t;
struct mca_btl_ofi_rdma_completion_t {
mca_btl_ofi_base_completion_t base;
mca_btl_ofi_completion_context_t comp_ctx;
void *local_address;
mca_btl_base_registration_handle_t *local_handle;
/* information for atomic op */
uint64_t operand;
uint64_t compare;
mca_btl_base_rdma_completion_fn_t cbfunc;
void *cbcontext;
void *cbdata;
};
typedef struct mca_btl_ofi_completion_t mca_btl_ofi_completion_t;
typedef struct mca_btl_ofi_rdma_completion_t mca_btl_ofi_rdma_completion_t;
OBJ_CLASS_DECLARATION(mca_btl_ofi_completion_t);
struct mca_btl_ofi_frag_completion_t {
mca_btl_ofi_base_completion_t base;
mca_btl_ofi_completion_context_t comp_ctx;
mca_btl_ofi_base_frag_t *frag;
};
typedef struct mca_btl_ofi_frag_completion_t mca_btl_ofi_frag_completion_t;
OBJ_CLASS_DECLARATION(mca_btl_ofi_rdma_completion_t);
OBJ_CLASS_DECLARATION(mca_btl_ofi_frag_completion_t);
/**
* Initiate an asynchronous put.
@ -288,6 +351,10 @@ int mca_btl_ofi_reg_mem (void *reg_data, void *base, size_t size,
int mca_btl_ofi_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg);
int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context);
mca_btl_ofi_module_t * mca_btl_ofi_module_alloc (int mode);
int mca_btl_ofi_post_recvs(mca_btl_base_module_t* module, mca_btl_ofi_context_t *context, int count);
void mca_btl_ofi_exit(void);
/* thread atomics */

Просмотреть файл

@ -42,7 +42,7 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
mca_btl_ofi_completion_t *comp = NULL;
mca_btl_ofi_rdma_completion_t *comp = NULL;
mca_btl_ofi_context_t *ofi_context;
ofi_context = get_ofi_context(ofi_btl);
@ -53,12 +53,12 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end
fi_op = to_fi_op(op);
comp = mca_btl_ofi_completion_alloc(btl, endpoint,
ofi_context,
local_address,
local_handle,
cbfunc, cbcontext, cbdata,
MCA_BTL_OFI_TYPE_AFOP);
comp = mca_btl_ofi_rdma_completion_alloc(btl, endpoint,
ofi_context,
local_address,
local_handle,
cbfunc, cbcontext, cbdata,
MCA_BTL_OFI_TYPE_AFOP);
/* copy the operand because it might get freed from upper layer */
comp->operand = (uint64_t) operand;
@ -70,7 +70,7 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end
local_address, local_handle->desc, /* results */
btl_endpoint->peer_addr, /* remote addr */
remote_address, remote_handle->rkey, /* remote buffer */
fi_datatype, fi_op, comp);
fi_datatype, fi_op, &comp->comp_ctx);
if (rc == -FI_EAGAIN) {
return OPAL_ERR_OUT_OF_RESOURCE;
@ -95,7 +95,7 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
mca_btl_ofi_completion_t *comp = NULL;
mca_btl_ofi_rdma_completion_t *comp = NULL;
mca_btl_ofi_context_t *ofi_context;
ofi_context = get_ofi_context(ofi_btl);
@ -106,12 +106,12 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
fi_op = to_fi_op(op);
comp = mca_btl_ofi_completion_alloc(btl, endpoint,
ofi_context,
NULL,
NULL,
cbfunc, cbcontext, cbdata,
MCA_BTL_OFI_TYPE_AOP);
comp = mca_btl_ofi_rdma_completion_alloc(btl, endpoint,
ofi_context,
NULL,
NULL,
cbfunc, cbcontext, cbdata,
MCA_BTL_OFI_TYPE_AOP);
/* copy the operand because it might get freed from upper layer */
comp->operand = (uint64_t) operand;
@ -122,7 +122,7 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
(void*) &comp->operand, 1, NULL, /* operand */
btl_endpoint->peer_addr, /* remote addr */
remote_address, remote_handle->rkey, /* remote buffer */
fi_datatype, fi_op, comp);
fi_datatype, fi_op, &comp->comp_ctx);
if (rc == -FI_EAGAIN) {
return OPAL_ERR_OUT_OF_RESOURCE;
@ -144,9 +144,10 @@ int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_e
int rc;
int fi_datatype = FI_UINT64;
mca_btl_ofi_rdma_completion_t *comp = NULL;
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
mca_btl_ofi_completion_t *comp = NULL;
mca_btl_ofi_context_t *ofi_context;
ofi_context = get_ofi_context(ofi_btl);
@ -155,12 +156,12 @@ int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_e
fi_datatype = FI_UINT32;
}
comp = mca_btl_ofi_completion_alloc(btl, endpoint,
ofi_context,
local_address,
local_handle,
cbfunc, cbcontext, cbdata,
MCA_BTL_OFI_TYPE_CSWAP);
comp = mca_btl_ofi_rdma_completion_alloc(btl, endpoint,
ofi_context,
local_address,
local_handle,
cbfunc, cbcontext, cbdata,
MCA_BTL_OFI_TYPE_CSWAP);
/* copy the operand because it might get freed from upper layer */
comp->operand = (uint64_t) value;
@ -177,7 +178,7 @@ int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_e
remote_address, remote_handle->rkey,
fi_datatype,
FI_CSWAP,
comp);
&comp->comp_ctx);
if (rc == -FI_EAGAIN) {
return OPAL_ERR_OUT_OF_RESOURCE;

Просмотреть файл

@ -33,26 +33,28 @@
#include "btl_ofi.h"
#include "btl_ofi_endpoint.h"
#include "btl_ofi_rdma.h"
#include "btl_ofi_frag.h"
#define MCA_BTL_OFI_ONE_SIDED_REQUIRED_CAPS (FI_RMA | FI_ATOMIC)
#define MCA_BTL_OFI_TWO_SIDED_REQUIRED_CAPS (FI_MSG)
#define MCA_BTL_OFI_REQUIRED_CAPS (FI_RMA | FI_ATOMIC)
#define MCA_BTL_OFI_REQUESTED_MR_MODE (FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_VIRT_ADDR)
static char *prov_include;
static char *prov_exclude;
static char *ofi_progress_mode;
static bool disable_sep;
static int mca_btl_ofi_init_device(struct fi_info *info);
/* validate information returned from fi_getinfo().
* return OPAL_ERROR if we dont have what we need. */
static int validate_info(struct fi_info *info)
static int validate_info(struct fi_info *info, uint64_t required_caps)
{
int mr_mode;
BTL_VERBOSE(("validating device: %s", info->domain_attr->name));
/* we need exactly all the required bits */
if ((info->caps & MCA_BTL_OFI_REQUIRED_CAPS) != MCA_BTL_OFI_REQUIRED_CAPS) {
if ((info->caps & required_caps) != required_caps) {
BTL_VERBOSE(("unsupported caps"));
return OPAL_ERROR;
}
@ -83,8 +85,27 @@ static int validate_info(struct fi_info *info)
/* Register the MCA parameters */
static int mca_btl_ofi_component_register(void)
{
char *msg;
mca_btl_ofi_module_t *module = &mca_btl_ofi_module_template;
asprintf(&msg, "BTL OFI mode of operation. Valid values are: %d = One-Sided only, %d=Two-Sided only, "
"%d = Both one and two sided. BTL OFI is only optimized for one-sided communication",
MCA_BTL_OFI_MODE_ONE_SIDED,
MCA_BTL_OFI_MODE_TWO_SIDED,
MCA_BTL_OFI_MODE_FULL_SUPPORT);
if (NULL == msg) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
mca_btl_ofi_component.mode = MCA_BTL_OFI_MODE_ONE_SIDED;
(void)mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
"mode",
msg,
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_btl_ofi_component.mode);
/* fi_getinfo with prov_name == NULL means ALL provider.
* Since now we are using the first valid info returned, I'm not sure
* if we need to provide the support for comma limited provider list. */
@ -100,19 +121,6 @@ static int mca_btl_ofi_component_register(void)
MCA_BASE_VAR_SCOPE_READONLY,
&prov_include);
/* TODO: this param has not been implemented. Not sure if we need it. " */
prov_exclude = NULL;
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
"provider_exclude",
"Comma-delimited list of OFI providers that are not considered for use "
"(default: \"sockets,mxm\"; empty value means that all providers will "
" be considered). "
"Mutually exclusive with btl_ofi_provider_include.",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
OPAL_INFO_LVL_4,
MCA_BASE_VAR_SCOPE_READONLY,
&prov_exclude);
mca_btl_ofi_component.num_cqe_read = MCA_BTL_OFI_NUM_CQE_READ;
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
"num_cq_read",
@ -146,13 +154,13 @@ static int mca_btl_ofi_component_register(void)
disable_sep = false;
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
"disable_sep",
"force btl/ofi to never use scalable endpoint. ",
"force btl/ofi to never use scalable endpoint.",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&disable_sep);
mca_btl_ofi_component.progress_threshold = MCA_BTL_OFI_PROGRESS_THRESHOLD;
mca_btl_ofi_component.progress_threshold = MCA_BTL_OFI_DEFAULT_PROGRESS_THRESHOLD;
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
"progress_threshold",
"number of outstanding operation before btl will progress "
@ -163,7 +171,17 @@ static int mca_btl_ofi_component_register(void)
MCA_BASE_VAR_SCOPE_READONLY,
&mca_btl_ofi_component.progress_threshold);
/* for now we want this component to lose to btl/ugni and btl/vader */
mca_btl_ofi_component.rd_num = MCA_BTL_OFI_DEFAULT_RD_NUM;
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
"rd_num",
"Number of receive descriptor posted per context.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_btl_ofi_component.rd_num);
/* for now we want this component to lose to the MTL. */
module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 50;
return mca_btl_base_param_register (&mca_btl_ofi_component.super.btl_version,
@ -226,6 +244,26 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules,
struct fi_tx_attr tx_attr = {0};
struct fi_fabric_attr fabric_attr = {0};
struct fi_domain_attr domain_attr = {0};
uint64_t required_caps;
switch (mca_btl_ofi_component.mode) {
case MCA_BTL_OFI_MODE_TWO_SIDED:
mca_btl_ofi_component.two_sided_enabled = true;
required_caps = MCA_BTL_OFI_TWO_SIDED_REQUIRED_CAPS;
break;
case MCA_BTL_OFI_MODE_FULL_SUPPORT:
mca_btl_ofi_component.two_sided_enabled = true;
required_caps = MCA_BTL_OFI_ONE_SIDED_REQUIRED_CAPS |
MCA_BTL_OFI_TWO_SIDED_REQUIRED_CAPS;
break;
default:
/* default to only one sided. */
required_caps = MCA_BTL_OFI_ONE_SIDED_REQUIRED_CAPS;
break;
}
/* Select the provider */
fabric_attr.prov_name = prov_include;
@ -248,7 +286,9 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules,
ep_attr.type = FI_EP_RDM;
/* ask for capabilities */
hints.caps = MCA_BTL_OFI_REQUIRED_CAPS;
/* TODO: catch the caps here. */
hints.caps = required_caps;
hints.mode = FI_CONTEXT;
/* Ask for completion context */
hints.mode = FI_CONTEXT;
@ -285,7 +325,7 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules,
info = info_list;
while(info) {
rc = validate_info(info);
rc = validate_info(info, required_caps);
if (OPAL_SUCCESS == rc) {
/* Device passed sanity check, let's make a module.
* We only pick the first device we found valid */
@ -337,13 +377,15 @@ static int mca_btl_ofi_init_device(struct fi_info *info)
mca_btl_ofi_module_t *module;
/* allocate module */
module = (mca_btl_ofi_module_t*) calloc(1, sizeof(mca_btl_ofi_module_t));
module = mca_btl_ofi_module_alloc(mca_btl_ofi_component.mode);
if (NULL == module) {
BTL_ERROR(("failed to allocate memory for OFI module"));
BTL_VERBOSE(("failed allocating ofi module"));
goto fail;
}
*module = mca_btl_ofi_module_template;
/* If the user ask for two sided support, something bad is happening
* to the MTL, so we will take maximum priority to supersede the MTL. */
module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_DEFAULT;
/* make a copy of the given info to store on the module */
ofi_info = fi_dupinfo(info);
@ -486,6 +528,13 @@ static int mca_btl_ofi_init_device(struct fi_info *info)
/* create endpoint list */
OBJ_CONSTRUCT(&module->endpoints, opal_list_t);
OBJ_CONSTRUCT(&module->module_lock, opal_mutex_t);
OBJ_CONSTRUCT(&module->id_to_endpoint, opal_hash_table_t);
rc = opal_hash_table_init (&module->id_to_endpoint, 512);
if (OPAL_SUCCESS != rc) {
BTL_ERROR(("error initializing hash table."));
goto fail;
}
/* create and send the modex for this device */
namelen = sizeof(ep_name);
@ -498,6 +547,21 @@ static int mca_btl_ofi_init_device(struct fi_info *info)
goto fail;
}
/* If we have two-sided support. */
if (TWO_SIDED_ENABLED) {
/* post wildcard recvs */
for (int i=0; i < module->num_contexts; i++) {
rc = mca_btl_ofi_post_recvs((mca_btl_base_module_t*) module,
&module->contexts[i],
mca_btl_ofi_component.rd_num);
if (OPAL_SUCCESS != rc) {
goto fail;
}
}
}
/* post our endpoint name so peer can use it to connect to us */
OPAL_MODEX_SEND(rc,
OPAL_PMIX_GLOBAL,
@ -586,81 +650,6 @@ static int mca_btl_ofi_component_progress (void)
return events;
}
int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context) {
int ret = 0;
int events_read;
int events = 0;
struct fi_cq_entry cq_entry[MCA_BTL_OFI_MAX_CQ_READ_ENTRIES];
struct fi_cq_err_entry cqerr = {0};
mca_btl_ofi_completion_t *comp;
ret = fi_cq_read(context->cq, &cq_entry, mca_btl_ofi_component.num_cqe_read);
if (0 < ret) {
events_read = ret;
for (int i = 0; i < events_read; i++) {
if (NULL != cq_entry[i].op_context) {
++events;
comp = (mca_btl_ofi_completion_t*) cq_entry[i].op_context;
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t*)comp->btl;
switch (comp->type) {
case MCA_BTL_OFI_TYPE_GET:
case MCA_BTL_OFI_TYPE_PUT:
case MCA_BTL_OFI_TYPE_AOP:
case MCA_BTL_OFI_TYPE_AFOP:
case MCA_BTL_OFI_TYPE_CSWAP:
/* call the callback */
if (comp->cbfunc) {
comp->cbfunc (comp->btl, comp->endpoint,
comp->local_address, comp->local_handle,
comp->cbcontext, comp->cbdata, OPAL_SUCCESS);
}
/* return the completion handler */
opal_free_list_return(comp->my_list, (opal_free_list_item_t*) comp);
MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl);
break;
default:
/* catasthrophic */
BTL_ERROR(("unknown completion type"));
MCA_BTL_OFI_ABORT();
}
}
}
} else if (OPAL_UNLIKELY(ret == -FI_EAVAIL)) {
ret = fi_cq_readerr(context->cq, &cqerr, 0);
/* cq readerr failed!? */
if (0 > ret) {
BTL_ERROR(("%s:%d: Error returned from fi_cq_readerr: %s(%d)",
__FILE__, __LINE__, fi_strerror(-ret), ret));
} else {
BTL_ERROR(("fi_cq_readerr: (provider err_code = %d)\n",
cqerr.prov_errno));
}
MCA_BTL_OFI_ABORT();
}
#ifdef FI_EINTR
/* sometimes, sockets provider complain about interupt. We do nothing. */
else if (OPAL_UNLIKELY(ret == -FI_EINTR)) {
}
#endif
/* If the error is not FI_EAGAIN, report the error and abort. */
else if (OPAL_UNLIKELY(ret != -FI_EAGAIN)) {
BTL_ERROR(("fi_cq_read returned error %d:%s", ret, fi_strerror(-ret)));
MCA_BTL_OFI_ABORT();
}
return events;
}
/** OFI btl component */
mca_btl_ofi_component_t mca_btl_ofi_component = {
.super = {

463
opal/mca/btl/ofi/btl_ofi_context.c Обычный файл
Просмотреть файл

@ -0,0 +1,463 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* $COPYRIGHT$
* Copyright (c) 2018 Intel Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "btl_ofi.h"
#include "btl_ofi_frag.h"
#include "btl_ofi_rdma.h"
#if OPAL_HAVE_THREAD_LOCAL
/* Per-thread cached communication context. Assigned on first use in
 * get_ofi_context() so that each thread keeps using the same context,
 * reducing lock/CQ contention between threads. */
opal_thread_local mca_btl_ofi_context_t *my_context = NULL;
#endif /* OPAL_HAVE_THREAD_LOCAL */
/* Initialize the free lists owned by one communication context.
 *
 * Always creates the RDMA/atomic completion free list. When two-sided
 * mode is enabled (TWO_SIDED_ENABLED) it additionally creates the
 * fragment-completion free list and the fragment pool (each fragment
 * carries MCA_BTL_OFI_FRAG_SIZE bytes of payload space).
 *
 * The lists are kept per-context to reduce thread contention on a
 * shared free list.
 *
 * Returns OPAL_SUCCESS, or the error code from opal_free_list_init()
 * (note: a frag_list failure is logged and then propagated via rc). */
int init_context_freelists(mca_btl_ofi_context_t *context)
{
    int rc;

    /* completion objects for one-sided (RDMA/atomic) operations */
    OBJ_CONSTRUCT(&context->rdma_comp_list, opal_free_list_t);
    rc = opal_free_list_init(&context->rdma_comp_list,
                             sizeof(mca_btl_ofi_rdma_completion_t),
                             opal_cache_line_size,
                             OBJ_CLASS(mca_btl_ofi_rdma_completion_t),
                             0,     /* payload size */
                             0,     /* payload alignment */
                             512,   /* initial elements */
                             -1,    /* no maximum */
                             512,   /* growth increment */
                             NULL,
                             0,
                             NULL,
                             NULL,
                             NULL);
    if (rc != OPAL_SUCCESS) {
        BTL_VERBOSE(("cannot allocate completion freelist"));
        return rc;
    }

    if (TWO_SIDED_ENABLED) {
        /* completion objects for two-sided send/recv operations */
        OBJ_CONSTRUCT(&context->frag_comp_list, opal_free_list_t);
        rc = opal_free_list_init(&context->frag_comp_list,
                                 sizeof(mca_btl_ofi_frag_completion_t),
                                 opal_cache_line_size,
                                 OBJ_CLASS(mca_btl_ofi_frag_completion_t),
                                 0,     /* payload size */
                                 0,     /* payload alignment */
                                 512,   /* initial elements */
                                 -1,    /* no maximum */
                                 512,   /* growth increment */
                                 NULL,
                                 0,
                                 NULL,
                                 NULL,
                                 NULL);
        if (rc != OPAL_SUCCESS) {
            BTL_VERBOSE(("cannot allocate completion freelist"));
            return rc;
        }

        /* Initialize frag pool: each element is the frag descriptor
         * plus MCA_BTL_OFI_FRAG_SIZE bytes of inline payload space. */
        OBJ_CONSTRUCT(&context->frag_list, opal_free_list_t);
        rc = opal_free_list_init(&context->frag_list,
                                 sizeof(mca_btl_ofi_base_frag_t) +
                                    MCA_BTL_OFI_FRAG_SIZE,
                                 opal_cache_line_size,
                                 OBJ_CLASS(mca_btl_ofi_base_frag_t),
                                 0,      /* payload size */
                                 0,      /* payload alignment */
                                 1024,   /* initial elements */
                                 -1,     /* no maximum */
                                 1024,   /* growth increment */
                                 NULL,
                                 0,
                                 NULL,
                                 NULL,
                                 NULL);
        if (OPAL_SUCCESS != rc) {
            /* rc still holds the failure and is returned below */
            BTL_VERBOSE(("failed to init frag pool (free_list)"));
        }
    }

    return rc;
}
/* mca_btl_ofi_context_alloc_normal()
 *
 * Allocate a single ofi_context for a normal (non-scalable) endpoint:
 * open a CQ, bind the AV and the CQ to the endpoint, and initialize the
 * per-context free lists.  The endpoint itself is aliased as both the
 * tx and the rx context.  On any failure the partially built context is
 * torn down with mca_btl_ofi_context_finalize() and NULL is returned.
 * USE WITH NORMAL ENDPOINT ONLY */
mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_normal(struct fi_info *info,
                                                        struct fid_domain *domain,
                                                        struct fid_ep *ep,
                                                        struct fid_av *av)
{
    int rc;
    /* a single shared tx/rx endpoint needs its one CQ to report
     * transmit, send and receive completions */
    uint32_t cq_flags = FI_TRANSMIT | FI_SEND | FI_RECV;
    char *linux_device_name = info->domain_attr->name;
    struct fi_cq_attr cq_attr = {0};

    mca_btl_ofi_context_t *context;

    context = (mca_btl_ofi_context_t*) calloc(1, sizeof(*context));
    if (NULL == context) {
        BTL_VERBOSE(("cannot allocate context"));
        return NULL;
    }

    /* Don't really need to check, just avoiding compiler warning because
     * BTL_VERBOSE is a no op in performance build and the compiler will
     * complain about unused variable. */
    if (NULL == linux_device_name) {
        BTL_VERBOSE(("linux device name is NULL. This shouldn't happen."));
        goto single_fail;
    }

    cq_attr.format = FI_CQ_FORMAT_CONTEXT;
    cq_attr.wait_obj = FI_WAIT_NONE;
    rc = fi_cq_open(domain, &cq_attr, &context->cq, NULL);
    if (0 != rc) {
        BTL_VERBOSE(("%s failed fi_cq_open with err=%s",
                        linux_device_name,
                        fi_strerror(-rc)
                        ));
        goto single_fail;
    }

    rc = fi_ep_bind(ep, (fid_t)av, 0);
    if (0 != rc) {
        BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
                        linux_device_name,
                        fi_strerror(-rc)
                        ));
        goto single_fail;
    }

    rc = fi_ep_bind(ep, (fid_t)context->cq, cq_flags);
    if (0 != rc) {
        /* was misreported as "fi_scalable_ep_bind"; this is fi_ep_bind */
        BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
                        linux_device_name,
                        fi_strerror(-rc)
                        ));
        goto single_fail;
    }

    rc = init_context_freelists(context);
    if (rc != OPAL_SUCCESS) {
        goto single_fail;
    }

    /* alias the endpoint as both tx and rx context */
    context->tx_ctx = ep;
    context->rx_ctx = ep;
    context->context_id = 0;

    return context;

single_fail:
    mca_btl_ofi_context_finalize(context, false);
    return NULL;
}
/* mca_btl_ofi_context_alloc_scalable()
 *
 * This function allocate communication contexts and return the pointer
 * to the first btl context. It also take care of all the bindings needed:
 * AV bound to the scalable endpoint, one tx and one rx context per btl
 * context, one CQ per context bound to tx (and, in two-sided mode, rx),
 * both contexts enabled, and the per-context free lists initialized.
 * On any failure every context is finalized and NULL is returned.
 * USE WITH SCALABLE ENDPOINT ONLY */
mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_scalable(struct fi_info *info,
                                                          struct fid_domain *domain,
                                                          struct fid_ep *sep,
                                                          struct fid_av *av,
                                                          size_t num_contexts)
{
    BTL_VERBOSE(("creating %zu contexts", num_contexts));

    int rc;
    size_t i;
    char *linux_device_name = info->domain_attr->name;

    struct fi_cq_attr cq_attr = {0};
    struct fi_tx_attr tx_attr = {0};
    struct fi_rx_attr rx_attr = {0};

    mca_btl_ofi_context_t *contexts;
    /* request delivery-complete semantics on transmit */
    tx_attr.op_flags = FI_DELIVERY_COMPLETE;

    /* calloc zeroes the array, so a partially built context has NULL
     * fids and the failure path below can finalize all of them safely */
    contexts = (mca_btl_ofi_context_t*) calloc(num_contexts, sizeof(*contexts));
    if (NULL == contexts) {
        BTL_VERBOSE(("cannot allocate communication contexts."));
        return NULL;
    }

    /* Don't really need to check, just avoiding compiler warning because
     * BTL_VERBOSE is a no op in performance build and the compiler will
     * complain about unused variable. */
    if (NULL == linux_device_name) {
        BTL_VERBOSE(("linux device name is NULL. This shouldn't happen."));
        goto scalable_fail;
    }

    /* bind AV to endpoint */
    rc = fi_scalable_ep_bind(sep, (fid_t)av, 0);
    if (0 != rc) {
        BTL_VERBOSE(("%s failed fi_scalable_ep_bind with err=%s",
                        linux_device_name,
                        fi_strerror(-rc)
                        ));
        goto scalable_fail;
    }

    for (i=0; i < num_contexts; i++) {
        rc = fi_tx_context(sep, i, &tx_attr, &contexts[i].tx_ctx, NULL);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_tx_context with err=%s",
                            linux_device_name,
                            fi_strerror(-rc)
                            ));
            goto scalable_fail;
        }

        /* The rx context receives two-sided traffic when two-sided mode
         * is enabled.  Even in one-sided mode we still create it: the
         * sockets provider will hang if we dont have one, and it is
         * also nice to have equal number of tx/rx context. */
        rc = fi_rx_context(sep, i, &rx_attr, &contexts[i].rx_ctx, NULL);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_rx_context with err=%s",
                            linux_device_name,
                            fi_strerror(-rc)
                            ));
            goto scalable_fail;
        }

        /* create CQ */
        cq_attr.format = FI_CQ_FORMAT_CONTEXT;
        cq_attr.wait_obj = FI_WAIT_NONE;
        rc = fi_cq_open(domain, &cq_attr, &contexts[i].cq, NULL);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_cq_open with err=%s",
                            linux_device_name,
                            fi_strerror(-rc)
                            ));
            goto scalable_fail;
        }

        /* bind cq to transmit context */
        rc = fi_ep_bind(contexts[i].tx_ctx, (fid_t)contexts[i].cq, FI_TRANSMIT);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
                            linux_device_name,
                            fi_strerror(-rc)
                            ));
            goto scalable_fail;
        }

        /* bind cq to receiving context (only needed for two-sided) */
        if (TWO_SIDED_ENABLED) {
            rc = fi_ep_bind(contexts[i].rx_ctx, (fid_t)contexts[i].cq, FI_RECV);
            if (0 != rc) {
                BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
                                linux_device_name,
                                fi_strerror(-rc)
                                ));
                goto scalable_fail;
            }
        }

        /* enable the context. */
        rc = fi_enable(contexts[i].tx_ctx);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_enable with err=%s",
                            linux_device_name,
                            fi_strerror(-rc)
                            ));
            goto scalable_fail;
        }

        rc = fi_enable(contexts[i].rx_ctx);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_enable with err=%s",
                            linux_device_name,
                            fi_strerror(-rc)
                            ));
            goto scalable_fail;
        }

        /* initialize freelists. */
        rc = init_context_freelists(&contexts[i]);
        if (rc != OPAL_SUCCESS) {
            goto scalable_fail;
        }

        /* assign the id */
        contexts[i].context_id = i;
    }

    return contexts;

scalable_fail:
    /* close and free */
    for(i=0; i < num_contexts; i++) {
        mca_btl_ofi_context_finalize(&contexts[i], true);
    }
    free(contexts);

    return NULL;
}
/* Tear down one communication context: close the tx/rx contexts (only
 * when scalable_ep is true -- for a normal endpoint tx_ctx/rx_ctx alias
 * the endpoint itself, which is closed by its owner), close the CQ,
 * and destruct the per-context free lists. */
void mca_btl_ofi_context_finalize(mca_btl_ofi_context_t *context, bool scalable_ep) {

    /* if it is a scalable ep, we have to close all contexts. */
    if (scalable_ep) {
        if (NULL != context->tx_ctx) {
            fi_close(&context->tx_ctx->fid);
        }

        if (NULL != context->rx_ctx) {
            fi_close(&context->rx_ctx->fid);
        }
    }

    if( NULL != context->cq) {
        fi_close(&context->cq->fid);
    }

    /* NOTE(review): on early failure paths these free lists may never
     * have been OBJ_CONSTRUCTed; destructing an unconstructed object is
     * questionable -- confirm against the opal OBJ_* semantics. */
    OBJ_DESTRUCT(&context->rdma_comp_list);

    if (TWO_SIDED_ENABLED) {
        OBJ_DESTRUCT(&context->frag_comp_list);
        OBJ_DESTRUCT(&context->frag_list);
    }
}
/* Get a context to use for communication.
 * If TLS is supported, it will use the cached endpoint.
 * If not, it will invoke the normal round-robin assignment. */
mca_btl_ofi_context_t *get_ofi_context(mca_btl_ofi_module_t *btl)
{
#if OPAL_HAVE_THREAD_LOCAL
    /* With TLS, we cache the context we use. */
    static volatile int64_t cur_num = 0;

    if (OPAL_UNLIKELY(my_context == NULL)) {
        /* First call on this thread: pick the next context round-robin
         * under the module lock and cache it in the thread-local
         * my_context for all later calls on this thread. */
        OPAL_THREAD_LOCK(&btl->module_lock);

        my_context = &btl->contexts[cur_num];
        cur_num = (cur_num + 1) %btl->num_contexts;

        OPAL_THREAD_UNLOCK(&btl->module_lock);
    }

    assert (my_context);
    return my_context;
#else
    return get_ofi_context_rr(btl);
#endif
}
/* return the context in a round-robin. */
/* There is no need for atomics here as it might hurt the performance.
 * NOTE(review): with concurrent callers the non-atomic increment makes
 * the distribution only approximately round-robin; that is a deliberate
 * trade-off, not a bug. */
mca_btl_ofi_context_t *get_ofi_context_rr(mca_btl_ofi_module_t *btl)
{
    static volatile uint64_t rr_num = 0;
    return &btl->contexts[rr_num++%btl->num_contexts];
}
/* Drain and dispatch completions from one context's CQ.
 *
 * Reads up to num_cqe_read entries, recovers the completion object from
 * each entry's fi_context (comp_ctx), and dispatches on its type:
 * RDMA/atomic types invoke the user callback, RECV hands the fragment
 * to the active-message path, SEND completes the fragment.  The
 * completion object is returned to its free list in every case.
 * CQ errors (other than EAGAIN and, where defined, EINTR) abort.
 *
 * Returns the number of completion events processed. */
int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context) {

    int ret = 0;
    int events_read;
    int events = 0;
    struct fi_cq_entry cq_entry[MCA_BTL_OFI_DEFAULT_MAX_CQE];
    struct fi_cq_err_entry cqerr = {0};

    mca_btl_ofi_completion_context_t *c_ctx;
    mca_btl_ofi_base_completion_t *comp;
    mca_btl_ofi_rdma_completion_t *rdma_comp;
    mca_btl_ofi_frag_completion_t *frag_comp;

    ret = fi_cq_read(context->cq, &cq_entry, mca_btl_ofi_component.num_cqe_read);

    if (0 < ret) {
        events_read = ret;
        for (int i = 0; i < events_read; i++) {
            if (NULL != cq_entry[i].op_context) {
                ++events;

                c_ctx = (mca_btl_ofi_completion_context_t*) cq_entry[i].op_context;

                /* We are casting to every type here just for simplicity.
                 * comp->type below tells us which cast is the valid one. */
                comp = (mca_btl_ofi_base_completion_t*) c_ctx->comp;
                frag_comp = (mca_btl_ofi_frag_completion_t*) c_ctx->comp;
                rdma_comp = (mca_btl_ofi_rdma_completion_t*) c_ctx->comp;

                switch (comp->type) {
                case MCA_BTL_OFI_TYPE_GET:
                case MCA_BTL_OFI_TYPE_PUT:
                case MCA_BTL_OFI_TYPE_AOP:
                case MCA_BTL_OFI_TYPE_AFOP:
                case MCA_BTL_OFI_TYPE_CSWAP:

                    /* call the callback */
                    if (rdma_comp->cbfunc) {
                        rdma_comp->cbfunc (comp->btl, comp->endpoint,
                                           rdma_comp->local_address, rdma_comp->local_handle,
                                           rdma_comp->cbcontext, rdma_comp->cbdata, OPAL_SUCCESS);
                    }

                    MCA_BTL_OFI_NUM_RDMA_DEC((mca_btl_ofi_module_t*) comp->btl);
                    break;

                case MCA_BTL_OFI_TYPE_RECV:
                    /* hand the received fragment to the active-message path */
                    mca_btl_ofi_recv_frag((mca_btl_ofi_module_t*) comp->btl,
                                          (mca_btl_ofi_endpoint_t*) comp->endpoint,
                                          context, frag_comp->frag);
                    break;

                case MCA_BTL_OFI_TYPE_SEND:
                    MCA_BTL_OFI_NUM_SEND_DEC((mca_btl_ofi_module_t*) comp->btl);
                    mca_btl_ofi_frag_complete(frag_comp->frag, OPAL_SUCCESS);
                    break;

                default:
                    /* catastrophic: unrecognized completion object */
                    BTL_ERROR(("unknown completion type"));
                    MCA_BTL_OFI_ABORT();
                }

                /* return the completion handler */
                opal_free_list_return(comp->my_list, (opal_free_list_item_t*) comp);
            }
        }
    } else if (OPAL_UNLIKELY(ret == -FI_EAVAIL)) {
        ret = fi_cq_readerr(context->cq, &cqerr, 0);

        /* cq readerr failed!? */
        if (0 > ret) {
            BTL_ERROR(("%s:%d: Error returned from fi_cq_readerr: %s(%d)",
                       __FILE__, __LINE__, fi_strerror(-ret), ret));
        } else {
            BTL_ERROR(("fi_cq_readerr: (provider err_code = %d)\n",
                       cqerr.prov_errno));
        }
        MCA_BTL_OFI_ABORT();
    }
#ifdef FI_EINTR
    /* sometimes, sockets provider complains about interrupt. We do nothing. */
    else if (OPAL_UNLIKELY(ret == -FI_EINTR)) {

    }
#endif
    /* If the error is not FI_EAGAIN, report the error and abort. */
    else if (OPAL_UNLIKELY(ret != -FI_EAGAIN)) {
        BTL_ERROR(("fi_cq_read returned error %d:%s", ret, fi_strerror(-ret)));
        MCA_BTL_OFI_ABORT();
    }

    return events;
}

Просмотреть файл

@ -15,10 +15,6 @@
#include "btl_ofi_endpoint.h"
#include "opal/util/proc.h"
#if OPAL_HAVE_THREAD_LOCAL
opal_thread_local mca_btl_ofi_context_t *my_context = NULL;
#endif /* OPAL_HAVE_THREAD_LOCAL */
static void mca_btl_ofi_endpoint_construct (mca_btl_ofi_endpoint_t *endpoint)
{
endpoint->peer_addr = 0;
@ -52,292 +48,3 @@ mca_btl_base_endpoint_t *mca_btl_ofi_endpoint_create (opal_proc_t *proc, struct
return (mca_btl_base_endpoint_t *) endpoint;
}
int ofi_comp_list_init(opal_free_list_t *comp_list)
{
int rc;
OBJ_CONSTRUCT(comp_list, opal_free_list_t);
rc = opal_free_list_init(comp_list,
sizeof(mca_btl_ofi_completion_t),
opal_cache_line_size,
OBJ_CLASS(mca_btl_ofi_completion_t),
0,
0,
128,
-1,
128,
NULL,
0,
NULL,
NULL,
NULL);
if (rc != OPAL_SUCCESS) {
BTL_VERBOSE(("cannot allocate completion freelist"));
}
return rc;
}
/* mca_btl_ofi_context_alloc_normal()
 *
 * This function will allocate an ofi_context, map the endpoint to tx/rx context,
 * bind CQ,AV to the endpoint and initialize all the structure.
 * Returns NULL on failure (nothing is leaked in that case).
 * USE WITH NORMAL ENDPOINT ONLY */
mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_normal(struct fi_info *info,
                                                        struct fid_domain *domain,
                                                        struct fid_ep *ep,
                                                        struct fid_av *av)
{
    int rc;
    uint32_t cq_flags = FI_TRANSMIT;
    char *linux_device_name = info->domain_attr->name;
    struct fi_cq_attr cq_attr = {0};

    mca_btl_ofi_context_t *context;

    context = (mca_btl_ofi_context_t*) calloc(1, sizeof(*context));
    if (NULL == context) {
        BTL_VERBOSE(("cannot allocate context"));
        return NULL;
    }

    /* Don't really need to check, just avoiding compiler warning because
     * BTL_VERBOSE is a no op in performance build and the compiler will
     * complain about unused variable. */
    if (NULL == linux_device_name) {
        BTL_VERBOSE(("linux device name is NULL. This shouldn't happen."));
        goto single_fail;
    }

    cq_attr.format = FI_CQ_FORMAT_CONTEXT;
    cq_attr.wait_obj = FI_WAIT_NONE;
    rc = fi_cq_open(domain, &cq_attr, &context->cq, NULL);
    if (0 != rc) {
        BTL_VERBOSE(("%s failed fi_cq_open with err=%s",
                        linux_device_name,
                        fi_strerror(-rc)
                        ));
        goto single_fail;
    }

    rc = fi_ep_bind(ep, (fid_t)av, 0);
    if (0 != rc) {
        BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
                        linux_device_name,
                        fi_strerror(-rc)
                        ));
        goto single_fail;
    }

    /* fixed message: this call is fi_ep_bind, not fi_scalable_ep_bind */
    rc = fi_ep_bind(ep, (fid_t)context->cq, cq_flags);
    if (0 != rc) {
        BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
                        linux_device_name,
                        fi_strerror(-rc)
                        ));
        goto single_fail;
    }

    rc = ofi_comp_list_init(&context->comp_list);
    if (rc != OPAL_SUCCESS) {
        goto single_fail;
    }

    /* With a normal endpoint, tx and rx contexts simply alias the endpoint. */
    context->tx_ctx = ep;
    context->rx_ctx = ep;
    context->context_id = 0;

    return context;

single_fail:
    mca_btl_ofi_context_finalize(context, false);
    /* fix: the context struct itself was previously leaked on error */
    free(context);
    return NULL;
}
/* mca_btl_ofi_context_alloc_scalable()
 *
 * This function allocates num_contexts communication contexts and returns the
 * pointer to the first btl context. It also takes care of all the bindings
 * needed (AV to the scalable endpoint; CQ to each tx context) and enables
 * each tx/rx context. Returns NULL on failure, releasing everything that was
 * created up to the point of failure.
 * USE WITH SCALABLE ENDPOINT ONLY */
mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_scalable(struct fi_info *info,
                                                          struct fid_domain *domain,
                                                          struct fid_ep *sep,
                                                          struct fid_av *av,
                                                          size_t num_contexts)
{
    BTL_VERBOSE(("creating %zu contexts", num_contexts));

    int rc;
    /* i is also read on the error path below, so it must start at 0. */
    size_t i = 0;
    char *linux_device_name = info->domain_attr->name;

    struct fi_cq_attr cq_attr = {0};
    struct fi_tx_attr tx_attr = {0};
    struct fi_rx_attr rx_attr = {0};

    mca_btl_ofi_context_t *contexts;
    tx_attr.op_flags = FI_DELIVERY_COMPLETE;

    contexts = (mca_btl_ofi_context_t*) calloc(num_contexts, sizeof(*contexts));
    if (NULL == contexts) {
        BTL_VERBOSE(("cannot allocate communication contexts."));
        return NULL;
    }

    /* Don't really need to check, just avoiding compiler warning because
     * BTL_VERBOSE is a no op in performance build and the compiler will
     * complain about unused variable. */
    if (NULL == linux_device_name) {
        BTL_VERBOSE(("linux device name is NULL. This shouldn't happen."));
        goto scalable_fail;
    }

    /* bind AV to endpoint */
    rc = fi_scalable_ep_bind(sep, (fid_t)av, 0);
    if (0 != rc) {
        BTL_VERBOSE(("%s failed fi_scalable_ep_bind with err=%s",
                        linux_device_name,
                        fi_strerror(-rc)
                        ));
        goto scalable_fail;
    }

    for (i=0; i < num_contexts; i++) {
        rc = fi_tx_context(sep, i, &tx_attr, &contexts[i].tx_ctx, NULL);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_tx_context with err=%s",
                            linux_device_name,
                            fi_strerror(-rc)
                            ));
            goto scalable_fail;
        }

        /* A receiving context is needed for two-sided communication, and
         * some providers (e.g. sockets) hang without one. It is also nice
         * to have an equal number of tx/rx contexts. */
        rc = fi_rx_context(sep, i, &rx_attr, &contexts[i].rx_ctx, NULL);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_rx_context with err=%s",
                            linux_device_name,
                            fi_strerror(-rc)
                            ));
            goto scalable_fail;
        }

        /* create CQ */
        cq_attr.format = FI_CQ_FORMAT_CONTEXT;
        cq_attr.wait_obj = FI_WAIT_NONE;
        rc = fi_cq_open(domain, &cq_attr, &contexts[i].cq, NULL);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_cq_open with err=%s",
                            linux_device_name,
                            fi_strerror(-rc)
                            ));
            goto scalable_fail;
        }

        /* bind cq to transmit context */
        uint32_t cq_flags = (FI_TRANSMIT);
        rc = fi_ep_bind(contexts[i].tx_ctx, (fid_t)contexts[i].cq, cq_flags);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
                            linux_device_name,
                            fi_strerror(-rc)
                            ));
            goto scalable_fail;
        }

        /* enable the context. */
        rc = fi_enable(contexts[i].tx_ctx);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_enable with err=%s",
                            linux_device_name,
                            fi_strerror(-rc)
                            ));
            goto scalable_fail;
        }

        rc = fi_enable(contexts[i].rx_ctx);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_enable with err=%s",
                            linux_device_name,
                            fi_strerror(-rc)
                            ));
            goto scalable_fail;
        }

        /* initialize completion freelist. */
        rc = ofi_comp_list_init(&contexts[i].comp_list);
        if (rc != OPAL_SUCCESS) {
            goto scalable_fail;
        }

        /* assign the id */
        contexts[i].context_id = i;
    }

    return contexts;

scalable_fail:
    /* fix: only tear down what this function actually touched. Contexts
     * below index i completed every step and can be finalized normally;
     * contexts above i were never initialized and finalizing them would
     * OBJ_DESTRUCT a never-constructed freelist. */
    for (size_t j = 0; j < i; j++) {
        mca_btl_ofi_context_finalize(&contexts[j], true);
    }

    /* contexts[i] may be partially initialized: close any fids that were
     * opened (calloc left the rest NULL) but skip the freelist destruct,
     * which may not have been constructed yet. */
    if (i < num_contexts) {
        if (NULL != contexts[i].tx_ctx) {
            fi_close(&contexts[i].tx_ctx->fid);
        }
        if (NULL != contexts[i].rx_ctx) {
            fi_close(&contexts[i].rx_ctx->fid);
        }
        if (NULL != contexts[i].cq) {
            fi_close(&contexts[i].cq->fid);
        }
    }

    free(contexts);
    return NULL;
}
/* Release the OFI resources owned by one btl context.
 *
 * For a scalable endpoint (scalable_ep == true) the context owns its own
 * tx/rx fids and must close them. For a normal endpoint, tx_ctx/rx_ctx
 * merely alias the shared endpoint (see mca_btl_ofi_context_alloc_normal),
 * which is closed elsewhere, so only the CQ is closed here. */
void mca_btl_ofi_context_finalize(mca_btl_ofi_context_t *context, bool scalable_ep) {

    /* if it is a scalable ep, we have to close all contexts. */
    if (scalable_ep) {
        if (NULL != context->tx_ctx) {
            fi_close(&context->tx_ctx->fid);
        }

        if (NULL != context->rx_ctx) {
            fi_close(&context->rx_ctx->fid);
        }
    }

    if( NULL != context->cq) {
        fi_close(&context->cq->fid);
    }

    /* Can we destruct the object that hasn't been constructed? */
    /* NOTE(review): if a caller bails out before ofi_comp_list_init() ran,
     * this OBJ_DESTRUCT runs on a never-constructed freelist -- confirm all
     * callers reach here with a constructed comp_list. */
    OBJ_DESTRUCT(&context->comp_list);
}
/* Get a context to use for communication.
 * If TLS is supported, the assigned context is cached per thread.
 * If not, it will invoke the normal round-robin assignment. */
mca_btl_ofi_context_t *get_ofi_context(mca_btl_ofi_module_t *btl)
{
#if OPAL_HAVE_THREAD_LOCAL
    /* Next context slot to hand out; guarded by the module lock below. */
    static volatile int64_t cur_num = 0;
    /* fix: remember which module the cached context belongs to. With more
     * than one OFI module loaded (multi-rail), a context cached for one
     * module must not be returned for a different module. */
    static opal_thread_local mca_btl_ofi_module_t *my_module = NULL;

    if (OPAL_UNLIKELY(NULL == my_context || my_module != btl)) {
        OPAL_THREAD_LOCK(&btl->module_lock);

        my_context = &btl->contexts[cur_num];
        cur_num = (cur_num + 1) %btl->num_contexts;
        my_module = btl;

        OPAL_THREAD_UNLOCK(&btl->module_lock);
    }

    assert (my_context);
    return my_context;
#else
    return get_ofi_context_rr(btl);
#endif
}
/* Return a context selected round-robin.
 * The shared counter is deliberately not atomic: a lost increment under
 * contention only perturbs the distribution, which is cheaper than paying
 * for atomic traffic on every call. */
mca_btl_ofi_context_t *get_ofi_context_rr(mca_btl_ofi_module_t *btl)
{
    static volatile uint64_t rr_num = 0;
    uint64_t slot = rr_num++;
    return &btl->contexts[slot % btl->num_contexts];
}

Просмотреть файл

@ -51,7 +51,7 @@ typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
typedef mca_btl_base_endpoint_t mca_btl_ofi_endpoint_t;
OBJ_CLASS_DECLARATION(mca_btl_ofi_endpoint_t);
int ofi_comp_list_init(opal_free_list_t *comp_list);
int init_context_freelists(mca_btl_ofi_context_t *context);
mca_btl_base_endpoint_t *mca_btl_ofi_endpoint_create (opal_proc_t *proc, struct fid_ep *ep);

198
opal/mca/btl/ofi/btl_ofi_frag.c Обычный файл
Просмотреть файл

@ -0,0 +1,198 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* $COPYRIGHT$
* Copyright (c) 2018 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2018 Intel Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include <assert.h>

#include "btl_ofi.h"
#include "btl_ofi_frag.h"
#include "btl_ofi_rdma.h"
#include "btl_ofi_endpoint.h"
/* Freelist constructor: reset a fragment to a clean state and point the
 * base descriptor at the embedded segment array.
 * NOTE(review): the memset offset math assumes 'base' is the first member
 * of mca_btl_ofi_base_frag_t -- confirm against the struct definition. */
static void mca_btl_ofi_base_frag_constructor (mca_btl_ofi_base_frag_t *frag)
{
    /* zero everything out */
    memset ((char *) frag + sizeof (frag->base), 0, sizeof (*frag) - sizeof (frag->base));

    frag->base.des_segments = frag->segments;
    frag->base.des_segment_count = 1;
}
/* Freelist destructor: intentionally a no-op -- fragments hold no resources
 * beyond their freelist slot. */
static void mca_btl_ofi_base_frag_destructor (mca_btl_ofi_base_frag_t *frag)
{
}

/* Register the fragment and frag-completion classes with the OPAL object
 * system so free lists can construct/destruct instances of them. */
OBJ_CLASS_INSTANCE(mca_btl_ofi_base_frag_t,
                   mca_btl_base_descriptor_t,
                   mca_btl_ofi_base_frag_constructor,
                   mca_btl_ofi_base_frag_destructor);

OBJ_CLASS_INSTANCE(mca_btl_ofi_frag_completion_t,
                   opal_free_list_item_t,
                   NULL,
                   NULL);
/* Allocate and initialize a frag completion from the context's free list.
 * The returned object carries the frag, the owning context/list, and the
 * libfabric context handed to fi_send/fi_recv for CQ matching. */
mca_btl_ofi_frag_completion_t *mca_btl_ofi_frag_completion_alloc
                                  (mca_btl_base_module_t *btl,
                                   mca_btl_ofi_context_t *context,
                                   mca_btl_ofi_base_frag_t *frag,
                                   int type)
{
    mca_btl_ofi_frag_completion_t *comp;

    comp = (mca_btl_ofi_frag_completion_t*) opal_free_list_get(&context->frag_comp_list);
    /* fix: the result was dereferenced unchecked; assert for consistency
     * with mca_btl_ofi_rdma_completion_alloc(). */
    assert(comp);

    comp->base.btl = btl;
    comp->base.my_context = context;
    comp->base.my_list = &context->frag_comp_list;
    comp->base.type = type;
    comp->frag = frag;
    comp->comp_ctx.comp = comp;

    return comp;
}
/* BTL alloc: hand out a send fragment of 'size' bytes with the given
 * descriptor flags and ordering. Returns NULL when the free list is empty. */
mca_btl_base_descriptor_t *mca_btl_ofi_alloc(
                                mca_btl_base_module_t *btl,
                                mca_btl_base_endpoint_t *endpoint,
                                uint64_t order, size_t size, uint32_t flags)
{
    mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t*) btl;
    mca_btl_ofi_context_t *ctx = get_ofi_context(ofi_btl);

    /* Pull a fragment off the selected context's free list. */
    mca_btl_ofi_base_frag_t *frag =
        mca_btl_ofi_frag_alloc(ofi_btl, &ctx->frag_list, endpoint);

    if (OPAL_UNLIKELY(NULL == frag)) {
        return NULL;
    }

    /* The payload buffer lives immediately after the fragment structure. */
    frag->segments[0].seg_addr.pval = frag + 1;
    frag->segments[0].seg_len = size;
    frag->base.des_segment_count = 1;
    frag->base.des_segments = &frag->segments[0];
    frag->base.des_flags = flags;
    frag->base.order = order;
    frag->hdr.len = size;

    return (mca_btl_base_descriptor_t*) frag;
}
/* BTL free: descriptors are freelist fragments, so freeing is just handing
 * the fragment back to the list it came from. Always succeeds. */
int mca_btl_ofi_free (mca_btl_base_module_t *btl, mca_btl_base_descriptor_t *des)
{
    mca_btl_ofi_base_frag_t *frag = (mca_btl_ofi_base_frag_t*) des;
    mca_btl_ofi_frag_return (frag);
    return OPAL_SUCCESS;
}
/* Two-sided send: transmit the fragment's btl header plus payload to the
 * peer. The descriptor is always completed through the send callback (the
 * ALWAYS_CALLBACK flag is forced on) when the libfabric completion is
 * reaped. Returns OPAL_ERR_OUT_OF_RESOURCE if fi_send cannot accept the
 * operation so the caller can retry. */
int mca_btl_ofi_send (mca_btl_base_module_t *btl,
                      mca_btl_base_endpoint_t *endpoint,
                      mca_btl_base_descriptor_t *descriptor,
                      mca_btl_base_tag_t tag)
{
    int rc = 0;
    mca_btl_ofi_context_t *context;
    mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t*) btl;
    mca_btl_ofi_endpoint_t *ofi_ep = (mca_btl_ofi_endpoint_t*) endpoint;
    mca_btl_ofi_base_frag_t *frag = (mca_btl_ofi_base_frag_t*) descriptor;
    mca_btl_ofi_frag_completion_t *comp;

    frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;

    /* This tag is the active message tag for the remote side */
    frag->hdr.tag = tag;

    /* create completion context */
    context = get_ofi_context(ofi_btl);
    comp = mca_btl_ofi_frag_completion_alloc(btl, context, frag,
                                             MCA_BTL_OFI_TYPE_SEND);

    /* send the frag. Note that we start sending from BTL header + payload
     * because we need the other side to have this header information. */
    rc = fi_send(context->tx_ctx,
                 &frag->hdr,
                 sizeof(mca_btl_ofi_header_t) + frag->hdr.len,
                 NULL,
                 ofi_ep->peer_addr,
                 &comp->comp_ctx);

    if (OPAL_UNLIKELY(FI_SUCCESS != rc)) {
        /* fix: this completion will never be reaped from the CQ, so hand it
         * back to its free list instead of leaking it. The frag stays with
         * the caller, who retries on resource exhaustion. */
        opal_free_list_return(comp->base.my_list, (opal_free_list_item_t*) comp);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    MCA_BTL_OFI_NUM_SEND_INC(ofi_btl);
    return OPAL_SUCCESS;
}
/* Deliver a received fragment to the upper layer and repost the receive.
 *
 * Called from progress when a two-sided recv completes: points the base
 * descriptor at the payload that follows the btl header, invokes the
 * active-message callback registered for hdr.tag, completes/returns the
 * fragment, then reposts one wildcard recv on the same context.
 *
 * NOTE(review): the 'endpoint' parameter is unused in this body. */
inline int mca_btl_ofi_recv_frag (mca_btl_ofi_module_t *ofi_btl,
                                  mca_btl_base_endpoint_t *endpoint,
                                  mca_btl_ofi_context_t *context,
                                  mca_btl_ofi_base_frag_t *frag)
{
    int rc;
    mca_btl_active_message_callback_t *reg;

    /* Tell PML where the payload is (it sits right after the frag struct) */
    frag->base.des_segments = frag->segments;
    frag->segments[0].seg_addr.pval = frag+1;
    frag->segments[0].seg_len = frag->hdr.len;
    frag->base.des_segment_count = 1;

    /* call the callback registered for this active-message tag */
    reg = mca_btl_base_active_message_trigger + frag->hdr.tag;
    reg->cbfunc (&ofi_btl->super, frag->hdr.tag, &frag->base, reg->cbdata);
    mca_btl_ofi_frag_complete(frag, OPAL_SUCCESS);

    /* repost the recv so the rx context never runs dry */
    rc = mca_btl_ofi_post_recvs((mca_btl_base_module_t*) ofi_btl, context, 1);
    if (OPAL_SUCCESS != rc) {
        /* might not be that bad but let's just fail here. */
        BTL_ERROR(("failed reposting receive."));
        MCA_BTL_OFI_ABORT();
    }

    return OPAL_SUCCESS;
}
/* Pack *size bytes from the convertor into a freshly allocated fragment,
 * leaving 'reserve' bytes at the front for the upper layer's header.
 * On return *size holds the number of bytes actually packed.
 *
 * NOTE(review): the return value of opal_convertor_pack() is ignored --
 * confirm a failed/short pack cannot occur on this path.
 * NOTE(review): base.order is forced to MCA_BTL_NO_ORDER below, overriding
 * the value mca_btl_ofi_alloc() stored from 'order' -- confirm intentional. */
struct mca_btl_base_descriptor_t *mca_btl_ofi_prepare_src (
                                          mca_btl_base_module_t *btl,
                                          mca_btl_base_endpoint_t *endpoint,
                                          opal_convertor_t *convertor,
                                          uint8_t order, size_t reserve,
                                          size_t *size, uint32_t flags)
{
    struct iovec iov;
    size_t length;
    uint32_t iov_count = 1;
    mca_btl_ofi_base_frag_t *frag;

    /* allocate the frag with reserve. */
    frag = (mca_btl_ofi_base_frag_t*) mca_btl_ofi_alloc(btl, endpoint,
                                                        order, reserve, flags);
    if (OPAL_UNLIKELY(NULL == frag)) {
        return NULL;
    }

    /* pack the data after the reserve */
    iov.iov_len = *size;
    iov.iov_base = (IOVBASE_TYPE*)(((unsigned char*)(frag->segments[0].seg_addr.pval)) + reserve);
    opal_convertor_pack(convertor, &iov, &iov_count, &length);

    /* pass on frag information: segment/header lengths grow by what was packed */
    frag->base.des_segments = frag->segments;
    frag->base.des_flags = flags;
    frag->base.order = MCA_BTL_NO_ORDER;
    frag->segments[0].seg_len += length;
    frag->hdr.len += length;
    *size = length;

    return &frag->base;
}

95
opal/mca/btl/ofi/btl_ofi_frag.h Обычный файл
Просмотреть файл

@ -0,0 +1,95 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2018 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#if !defined(MCA_BTL_OFI_FRAG_H)
#define MCA_BTL_OFI_FRAG_H
#include "btl_ofi.h"
#include "btl_ofi_endpoint.h"
#define MCA_BTL_OFI_HDR_SIZE sizeof(mca_btl_ofi_header_t)
#define MCA_BTL_OFI_FRAG_SIZE 4096

/* fix: parenthesized so the macro expands safely inside larger expressions
 * (the unparenthesized form broke under e.g. multiplication). */
#define MCA_BTL_OFI_RECV_SIZE (MCA_BTL_OFI_FRAG_SIZE + MCA_BTL_OFI_HDR_SIZE)

/* Bump the outstanding-send count and drive progress past the threshold.
 * fix: wrapped in do/while(0) so the macro behaves as a single statement
 * under an unbraced if/else, and 'module' is parenthesized everywhere. */
#define MCA_BTL_OFI_NUM_SEND_INC(module)                                          \
    do {                                                                          \
        OPAL_ATOMIC_ADD_FETCH64(&(module)->outstanding_send, 1);                  \
        if ((module)->outstanding_send > mca_btl_ofi_component.progress_threshold) { \
            mca_btl_ofi_component.super.btl_progress();                           \
        }                                                                         \
    } while (0)

#define MCA_BTL_OFI_NUM_SEND_DEC(module)                                          \
    do {                                                                          \
        OPAL_ATOMIC_ADD_FETCH64(&(module)->outstanding_send, -1);                 \
    } while (0)
mca_btl_base_descriptor_t *mca_btl_ofi_alloc(
mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
uint64_t order, size_t size, uint32_t flags);
int mca_btl_ofi_free (mca_btl_base_module_t *btl, mca_btl_base_descriptor_t *des);
int mca_btl_ofi_send (mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
mca_btl_base_descriptor_t *descriptor,
mca_btl_base_tag_t tag);
int mca_btl_ofi_recv_frag (mca_btl_ofi_module_t *ofi_btl,
mca_btl_base_endpoint_t *endpoint,
mca_btl_ofi_context_t *context,
mca_btl_ofi_base_frag_t *frag);
struct mca_btl_base_descriptor_t *mca_btl_ofi_prepare_src (
mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
opal_convertor_t *convertor,
uint8_t order, size_t reserve,
size_t *size, uint32_t flags);
mca_btl_ofi_frag_completion_t *mca_btl_ofi_frag_completion_alloc
(mca_btl_base_module_t *btl,
mca_btl_ofi_context_t *context,
mca_btl_ofi_base_frag_t *frag,
int type);
/* Grab a fragment from the given free list, recording the owning list,
 * endpoint and module so the frag can find its way home later.
 * Returns NULL when the list cannot supply an item. */
static inline mca_btl_ofi_base_frag_t *mca_btl_ofi_frag_alloc (mca_btl_ofi_module_t *ofi_btl, opal_free_list_t *fl,
                                                               mca_btl_base_endpoint_t *endpoint)
{
    opal_free_list_item_t *item = opal_free_list_get (fl);
    mca_btl_ofi_base_frag_t *frag = (mca_btl_ofi_base_frag_t *) item;

    if (OPAL_UNLIKELY(NULL == frag)) {
        return NULL;
    }

    frag->free_list = fl;
    frag->endpoint = endpoint;
    frag->btl = ofi_btl;

    return frag;
}
/* Hand a fragment back to the free list it was allocated from. */
static inline void mca_btl_ofi_frag_return (mca_btl_ofi_base_frag_t *frag)
{
    opal_free_list_t *home = frag->free_list;
    opal_free_list_return (home, &frag->base.super);
}
/* Complete a fragment: fire the sender's callback when one was requested,
 * then return the frag to its free list if the BTL owns it. */
static inline void mca_btl_ofi_frag_complete (mca_btl_ofi_base_frag_t *frag, int rc) {
    mca_btl_ofi_module_t *module = frag->btl;
    uint32_t flags = frag->base.des_flags;

    /* invoke the local completion callback if one was requested */
    if (MCA_BTL_DES_SEND_ALWAYS_CALLBACK & flags) {
        frag->base.des_cbfunc(&module->super, frag->endpoint, &frag->base, rc);
    }

    /* BTL-owned fragments go back to their free list */
    if (OPAL_LIKELY(MCA_BTL_DES_FLAGS_BTL_OWNERSHIP & flags)) {
        mca_btl_ofi_frag_return (frag);
    }
}
#endif /* !defined(MCA_BTL_OFI_FRAG_H) */

Просмотреть файл

@ -31,6 +31,7 @@
#include "btl_ofi.h"
#include "btl_ofi_endpoint.h"
#include "btl_ofi_frag.h"
static int mca_btl_ofi_add_procs (mca_btl_base_module_t *btl,
size_t nprocs, opal_proc_t **opal_procs,
@ -42,12 +43,33 @@ static int mca_btl_ofi_add_procs (mca_btl_base_module_t *btl,
char *ep_name = NULL;
size_t namelen = mca_btl_ofi_component.namelen;
opal_proc_t *proc;
mca_btl_base_endpoint_t *ep;
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
for (size_t i = 0 ; i < nprocs ; ++i) {
peers[i] = mca_btl_ofi_endpoint_create (opal_procs[i], ofi_btl->ofi_endpoint);
if (OPAL_UNLIKELY(NULL == peers[i])) {
return OPAL_ERR_OUT_OF_RESOURCE;
proc = opal_procs[i];
/* See if we already have an endpoint for this proc. */
rc = opal_hash_table_get_value_uint64 (&ofi_btl->id_to_endpoint, (intptr_t) proc, (void **) &ep);
if (OPAL_SUCCESS == rc) {
BTL_VERBOSE(("returning existing endpoint for proc %s", OPAL_NAME_PRINT(proc->proc_name)));
peers[i] = ep;
} else {
/* We don't have this endpoint yet, create one */
peers[i] = mca_btl_ofi_endpoint_create (proc, ofi_btl->ofi_endpoint);
BTL_VERBOSE(("creating peer %p", peers[i]));
if (OPAL_UNLIKELY(NULL == peers[i])) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
/* Add this endpoint to the lookup table */
(void) opal_hash_table_set_value_uint64 (&ofi_btl->id_to_endpoint, (intptr_t) proc, (void**) &ep);
}
OPAL_MODEX_RECV(rc, &mca_btl_ofi_component.super.btl_version,
@ -81,24 +103,29 @@ static int mca_btl_ofi_add_procs (mca_btl_base_module_t *btl,
static int mca_btl_ofi_del_procs (mca_btl_base_module_t *btl, size_t nprocs,
opal_proc_t **procs, mca_btl_base_endpoint_t **peers)
{
int ret;
int rc;
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
mca_btl_base_endpoint_t *ep;
for (size_t i = 0 ; i < nprocs ; ++i) {
if (peers[i]) {
rc = opal_hash_table_get_value_uint64 (&ofi_btl->id_to_endpoint, (intptr_t) procs[i], (void **) &ep);
/* remove the address from AV. */
ret = fi_av_remove(ofi_btl->av, &peers[i]->peer_addr, 1, 0);
if (ret < 0) {
/* remove failed. this should not happen. */
/* Lets not crash because we failed to remove an address. */
BTL_ERROR(("fi_av_remove failed with error %d:%s",
ret, fi_strerror(-ret)));
}
if (OPAL_SUCCESS == rc) {
/* remove the address from AV. */
rc = fi_av_remove(ofi_btl->av, &peers[i]->peer_addr, 1, 0);
if (rc < 0) {
/* remove failed. this should not happen. */
/* Lets not crash because we failed to remove an address. */
BTL_ERROR(("fi_av_remove failed with error %d:%s",
rc, fi_strerror(-rc)));
}
/* remove and free MPI endpoint from the list. */
opal_list_remove_item (&ofi_btl->endpoints, &peers[i]->super);
OBJ_RELEASE(peers[i]);
/* remove and free MPI endpoint from the list. */
opal_list_remove_item (&ofi_btl->endpoints, &peers[i]->super);
(void) opal_hash_table_remove_value_uint64 (&ofi_btl->id_to_endpoint, (intptr_t) procs[i]);
OBJ_RELEASE(peers[i]);
}
}
}
@ -281,6 +308,8 @@ int mca_btl_ofi_finalize (mca_btl_base_module_t* btl)
}
OBJ_DESTRUCT(&ofi_btl->endpoints);
OBJ_DESTRUCT(&ofi_btl->id_to_endpoint);
OBJ_DESTRUCT(&ofi_btl->module_lock);
if (ofi_btl->rcache) {
mca_rcache_base_module_destroy (ofi_btl->rcache);
@ -291,39 +320,119 @@ int mca_btl_ofi_finalize (mca_btl_base_module_t* btl)
return OPAL_SUCCESS;
}
/* Post 'count' wildcard recvs on the rx context.
 * Each posted recv owns a BTL fragment (header + payload buffer) and a
 * completion object that the progress loop reaps when a message arrives.
 * Returns OPAL_ERROR on the first failure; earlier posts remain in place. */
int mca_btl_ofi_post_recvs (mca_btl_base_module_t *module,
                            mca_btl_ofi_context_t *context,
                            int count)
{
    int i;
    int rc;
    mca_btl_ofi_base_frag_t *frag;
    mca_btl_ofi_frag_completion_t *comp;

    for (i=0; i < count; i++) {
        frag = (mca_btl_ofi_base_frag_t*) mca_btl_ofi_alloc(module,
                                                NULL,
                                                0,
                                                MCA_BTL_OFI_FRAG_SIZE,
                                                MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
        if (NULL == frag) {
            BTL_ERROR(("cannot allocate recv frag."));
            return OPAL_ERROR;
        }

        comp = mca_btl_ofi_frag_completion_alloc (module,
                                                  context,
                                                  frag,
                                                  MCA_BTL_OFI_TYPE_RECV);
        if (NULL == comp) {
            /* fix: don't leak the frag we just took */
            mca_btl_ofi_frag_return (frag);
            BTL_ERROR(("cannot allocate recv completion."));
            return OPAL_ERROR;
        }

        /* The buffer covers the btl header plus the maximum payload. */
        rc = fi_recv (context->rx_ctx, &frag->hdr, MCA_BTL_OFI_RECV_SIZE,
                      NULL, FI_ADDR_UNSPEC, &comp->comp_ctx);

        if (FI_SUCCESS != rc) {
            BTL_ERROR(("cannot post recvs"));
            /* fix: the frag and completion were previously leaked here */
            opal_free_list_return (comp->base.my_list,
                                   (opal_free_list_item_t*) comp);
            mca_btl_ofi_frag_return (frag);
            return OPAL_ERROR;
        }
    }

    return OPAL_SUCCESS;
}
/* Allocate and fill out the module capabilities according to operation mode. */
mca_btl_ofi_module_t * mca_btl_ofi_module_alloc (int mode)
{
mca_btl_ofi_module_t *module;
/* allocate module */
module = (mca_btl_ofi_module_t*) calloc(1, sizeof(mca_btl_ofi_module_t));
if (NULL == module) {
return NULL;
}
/* fill in the defaults */
*module = mca_btl_ofi_module_template;
if (mode == MCA_BTL_OFI_MODE_ONE_SIDED || mode == MCA_BTL_OFI_MODE_FULL_SUPPORT) {
module->super.btl_put = mca_btl_ofi_put;
module->super.btl_get = mca_btl_ofi_get;
module->super.btl_atomic_op = mca_btl_ofi_aop;
module->super.btl_atomic_fop = mca_btl_ofi_afop;
module->super.btl_atomic_cswap = mca_btl_ofi_acswap;
module->super.btl_flush = mca_btl_ofi_flush;
module->super.btl_register_mem = mca_btl_ofi_register_mem;
module->super.btl_deregister_mem = mca_btl_ofi_deregister_mem;
module->super.btl_flags |= MCA_BTL_FLAGS_ATOMIC_FOPS |
MCA_BTL_FLAGS_ATOMIC_OPS |
MCA_BTL_FLAGS_RDMA;
module->super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD |
MCA_BTL_ATOMIC_SUPPORTS_SWAP |
MCA_BTL_ATOMIC_SUPPORTS_CSWAP |
MCA_BTL_ATOMIC_SUPPORTS_32BIT ;
module->super.btl_put_limit = 1 << 23;
module->super.btl_put_alignment = 0;
module->super.btl_get_limit = 1 << 23;
module->super.btl_get_alignment = 0;
module->super.btl_registration_handle_size =
sizeof(mca_btl_base_registration_handle_t);
}
if (mode == MCA_BTL_OFI_MODE_TWO_SIDED || mode == MCA_BTL_OFI_MODE_FULL_SUPPORT) {
module->super.btl_alloc = mca_btl_ofi_alloc;
module->super.btl_free = mca_btl_ofi_free;
module->super.btl_prepare_src = mca_btl_ofi_prepare_src;
module->super.btl_send = mca_btl_ofi_send;
module->super.btl_flags |= MCA_BTL_FLAGS_SEND;
module->super.btl_eager_limit = MCA_BTL_OFI_FRAG_SIZE;
module->super.btl_max_send_size = MCA_BTL_OFI_FRAG_SIZE;
module->super.btl_rndv_eager_limit = MCA_BTL_OFI_FRAG_SIZE;
/* If two sided is enabled, we expected that the user knows exactly what
* they want. We bump the priority to maximum, making this BTL the default. */
module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH;
}
if (mode == MCA_BTL_OFI_MODE_FULL_SUPPORT) {
module->super.btl_rdma_pipeline_frag_size = 4 * 1024 * 1024;
module->super.btl_rdma_pipeline_send_length = 8 * 1024;
}
return module;
}
mca_btl_ofi_module_t mca_btl_ofi_module_template = {
.super = {
/* initialize functions. this btl only support RDMA and atomics
* for now so it does not provide prepare_src, alloc, free, or send */
.btl_component = &mca_btl_ofi_component.super,
.btl_add_procs = mca_btl_ofi_add_procs,
.btl_del_procs = mca_btl_ofi_del_procs,
.btl_finalize = mca_btl_ofi_finalize,
.btl_put = mca_btl_ofi_put,
.btl_get = mca_btl_ofi_get,
.btl_register_mem = mca_btl_ofi_register_mem,
.btl_deregister_mem = mca_btl_ofi_deregister_mem,
.btl_atomic_op = mca_btl_ofi_aop,
.btl_atomic_fop = mca_btl_ofi_afop,
.btl_atomic_cswap = mca_btl_ofi_acswap,
.btl_flush = mca_btl_ofi_flush,
/* set the default flags for this btl. ofi provides us with rdma and both
* fetching and non-fetching atomics (though limited to add and cswap) */
.btl_flags = MCA_BTL_FLAGS_RDMA |
MCA_BTL_FLAGS_ATOMIC_FOPS |
MCA_BTL_FLAGS_ATOMIC_OPS,
.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD |
MCA_BTL_ATOMIC_SUPPORTS_SWAP |
MCA_BTL_ATOMIC_SUPPORTS_CSWAP |
MCA_BTL_ATOMIC_SUPPORTS_32BIT,
/* set the default limits on put and get */
.btl_registration_handle_size = sizeof(mca_btl_base_registration_handle_t),
.btl_put_limit = 1 << 23,
.btl_put_alignment = 0,
.btl_get_limit = 1 << 23,
.btl_get_alignment = 0,
}
};

Просмотреть файл

@ -13,12 +13,12 @@
#include "btl_ofi_rdma.h"
OBJ_CLASS_INSTANCE(mca_btl_ofi_completion_t,
OBJ_CLASS_INSTANCE(mca_btl_ofi_rdma_completion_t,
opal_free_list_item_t,
NULL,
NULL);
mca_btl_ofi_completion_t *mca_btl_ofi_completion_alloc (
mca_btl_ofi_rdma_completion_t *mca_btl_ofi_rdma_completion_alloc (
mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
mca_btl_ofi_context_t *ofi_context,
@ -32,21 +32,24 @@ mca_btl_ofi_completion_t *mca_btl_ofi_completion_alloc (
assert(endpoint);
assert(ofi_context);
mca_btl_ofi_completion_t *comp;
mca_btl_ofi_rdma_completion_t *comp;
comp = (mca_btl_ofi_completion_t*) opal_free_list_get(&ofi_context->comp_list);
comp = (mca_btl_ofi_rdma_completion_t*) opal_free_list_get(&ofi_context->rdma_comp_list);
assert(comp);
comp->btl = btl;
comp->endpoint = endpoint;
comp->my_context = ofi_context;
comp->base.btl = btl;
comp->base.endpoint = endpoint;
comp->base.my_context = ofi_context;
comp->base.my_list = &ofi_context->rdma_comp_list;
comp->base.type = type;
comp->local_address = local_address;
comp->local_handle = local_handle;
comp->cbfunc = cbfunc;
comp->cbcontext = cbcontext;
comp->cbdata = cbdata;
comp->my_list = &ofi_context->comp_list;
comp->type = type;
comp->comp_ctx.comp = comp;
return comp;
}
@ -58,21 +61,21 @@ int mca_btl_ofi_get (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi
{
int rc;
mca_btl_ofi_rdma_completion_t *comp;
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
mca_btl_ofi_completion_t *comp;
mca_btl_ofi_context_t *ofi_context;
ofi_context = get_ofi_context(ofi_btl);
/* create completion context */
comp = mca_btl_ofi_completion_alloc(btl, endpoint,
ofi_context,
local_address,
local_handle,
cbfunc, cbcontext, cbdata,
MCA_BTL_OFI_TYPE_GET);
comp = mca_btl_ofi_rdma_completion_alloc(btl, endpoint,
ofi_context,
local_address,
local_handle,
cbfunc, cbcontext, cbdata,
MCA_BTL_OFI_TYPE_GET);
remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
@ -82,7 +85,7 @@ int mca_btl_ofi_get (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi
local_handle->desc,
btl_endpoint->peer_addr,
remote_address, remote_handle->rkey,
comp); /* completion context */
&comp->comp_ctx); /* completion context */
if (-FI_EAGAIN == rc) {
return OPAL_ERR_OUT_OF_RESOURCE;
@ -111,13 +114,13 @@ int mca_btl_ofi_put (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi
ofi_context = get_ofi_context(ofi_btl);
/* create completion context */
mca_btl_ofi_completion_t *comp;
comp = mca_btl_ofi_completion_alloc(btl, endpoint,
ofi_context,
local_address,
local_handle,
cbfunc, cbcontext, cbdata,
MCA_BTL_OFI_TYPE_PUT);
mca_btl_ofi_rdma_completion_t *comp;
comp = mca_btl_ofi_rdma_completion_alloc(btl, endpoint,
ofi_context,
local_address,
local_handle,
cbfunc, cbcontext, cbdata,
MCA_BTL_OFI_TYPE_PUT);
remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
@ -127,7 +130,7 @@ int mca_btl_ofi_put (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi
local_handle->desc,
btl_endpoint->peer_addr,
remote_address, remote_handle->rkey,
comp); /* completion context */
&comp->comp_ctx); /* completion context */
if (-FI_EAGAIN == rc) {
return OPAL_ERR_OUT_OF_RESOURCE;

Просмотреть файл

@ -19,7 +19,7 @@
#include "btl_ofi.h"
#include "btl_ofi_endpoint.h"
mca_btl_ofi_completion_t *mca_btl_ofi_completion_alloc (
mca_btl_ofi_rdma_completion_t *mca_btl_ofi_rdma_completion_alloc (
mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
mca_btl_ofi_context_t *ofi_context,