btl/ugni: improve multi-threaded RDMA performance
This commit improves the injection rate and latency for RDMA operations. This is done by the following improvements: - If C11's _Thread_local keyword is available then always use the same virtual device index for the same thread when using RDMA. If the keyword is not available then attempt to use any device that isn't already in use. The binding support is enabled by default but can be disabled via the btl_ugni_bind_devices MCA variable. - When posting FMA and RDMA operations always attempt to reap completions after posting the operation. This allows us to better balance the work of reaping completions across all application threads. - Limit the total number of outstanding BTE transactions. This fixes a performance bug when using many threads. - Split out RDMA and local SMSG completion queue sizes. The RDMA queue size is better tuned for performance with RMA-MT. - Split out put and get FMA limits. The old btl_ugni_fma_limit MCA variable is deprecated. The new variable names are: btl_ugni_fma_put_limit and btl_ugni_fma_get_limit. - Change how post descriptors are handled. They are no longer allocated separately from the RDMA endpoints. - Some cleanup to move error code out of the critical path. - Disable the FMA sharing flag on the CDM when we detect that there should be enough FMA descriptors for the number of virtual devices we plan to create. If the user sets this flag we will not unset it. This change should improve the small-message RMA performance by ~10%. Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
Этот коммит содержится в:
родитель
0ddbc75ce5
Коммит
b0ac6276a6
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
@ -51,7 +51,7 @@
|
||||
#define MCA_BTL_UGNI_MAX_DEV_HANDLES 128
|
||||
|
||||
/** number of rdma completion queue items to remove per progress loop */
|
||||
#define MCA_BTL_UGNI_COMPLETIONS_PER_LOOP 16
|
||||
#define MCA_BTL_UGNI_COMPLETIONS_PER_LOOP 32
|
||||
|
||||
/** how often to check for connection requests */
|
||||
#define MCA_BTL_UGNI_CONNECT_USEC 10
|
||||
@ -96,7 +96,7 @@ struct mca_btl_ugni_cq_t {
|
||||
/** ugni CQ handle */
|
||||
gni_cq_handle_t gni_handle;
|
||||
/** number of completions expected on the CQ */
|
||||
int32_t active_operations;
|
||||
volatile int32_t active_operations;
|
||||
};
|
||||
typedef struct mca_btl_ugni_cq_t mca_btl_ugni_cq_t;
|
||||
|
||||
@ -116,6 +116,9 @@ struct mca_btl_ugni_device_t {
|
||||
/** number of SMSG connections */
|
||||
volatile int32_t smsg_connections;
|
||||
|
||||
/** boolean indicating that the device was recently flushed */
|
||||
volatile bool flushed;
|
||||
|
||||
/** uGNI device handle */
|
||||
gni_nic_handle_t dev_handle;
|
||||
|
||||
@ -132,10 +135,7 @@ struct mca_btl_ugni_device_t {
|
||||
gni_mem_handle_t smsg_irq_mhndl;
|
||||
|
||||
/** RDMA endpoint free list */
|
||||
opal_free_list_t endpoints;
|
||||
|
||||
/** post descriptors pending resources */
|
||||
opal_list_t pending_post;
|
||||
opal_free_list_t rdma_descs;
|
||||
};
|
||||
typedef struct mca_btl_ugni_device_t mca_btl_ugni_device_t;
|
||||
|
||||
@ -162,8 +162,6 @@ typedef struct mca_btl_ugni_module_t {
|
||||
opal_mutex_t eager_get_pending_lock;
|
||||
opal_list_t eager_get_pending;
|
||||
|
||||
opal_free_list_t post_descriptors;
|
||||
|
||||
mca_mpool_base_module_t *mpool;
|
||||
opal_free_list_t smsg_mboxes;
|
||||
|
||||
@ -196,9 +194,7 @@ typedef struct mca_btl_ugni_module_t {
|
||||
* this rank should be limited too */
|
||||
int nlocal_procs;
|
||||
|
||||
volatile int active_send_count;
|
||||
volatile int64_t connected_peer_count;
|
||||
volatile int64_t active_rdma_count;
|
||||
volatile int32_t active_rdma_count;
|
||||
|
||||
mca_rcache_base_module_t *rcache;
|
||||
} mca_btl_ugni_module_t;
|
||||
@ -212,6 +208,10 @@ typedef struct mca_btl_ugni_component_t {
|
||||
/* Maximum number of entries a completion queue can hold */
|
||||
uint32_t remote_cq_size;
|
||||
uint32_t local_cq_size;
|
||||
uint32_t local_rdma_cq_size;
|
||||
/* There is a hardware limitation that hurts BTE performance
|
||||
* if we submit too many BTE requests. This acts as a throttle. */
|
||||
int32_t active_rdma_threshold;
|
||||
|
||||
/* number of ugni modules */
|
||||
uint32_t ugni_num_btls;
|
||||
@ -221,7 +221,16 @@ typedef struct mca_btl_ugni_component_t {
|
||||
size_t smsg_max_data;
|
||||
|
||||
/* After this message size switch to BTE protocols */
|
||||
size_t ugni_fma_limit;
|
||||
long int ugni_fma_limit;
|
||||
/** FMA switchover for get */
|
||||
long int ugni_fma_get_limit;
|
||||
/** FMA switchover for put */
|
||||
long int ugni_fma_put_limit;
|
||||
|
||||
#if OPAL_C_HAVE__THREAD_LOCAL
|
||||
bool bind_threads_to_devices;
|
||||
#endif
|
||||
|
||||
/* Switch to get when sending above this size */
|
||||
size_t ugni_smsg_limit;
|
||||
|
||||
@ -282,6 +291,9 @@ typedef struct mca_btl_ugni_component_t {
|
||||
|
||||
/** NIC address */
|
||||
uint32_t dev_addr;
|
||||
|
||||
/** MCA variable identifier for the cdm_flags variable */
|
||||
int cdm_flags_id;
|
||||
} mca_btl_ugni_component_t;
|
||||
|
||||
/* Global structures */
|
||||
@ -289,18 +301,20 @@ typedef struct mca_btl_ugni_component_t {
|
||||
OPAL_MODULE_DECLSPEC extern mca_btl_ugni_component_t mca_btl_ugni_component;
|
||||
OPAL_MODULE_DECLSPEC extern mca_btl_ugni_module_t mca_btl_ugni_module;
|
||||
|
||||
/**
 * Return the next virtual device index in round-robin order.
 *
 * A function-local static counter is advanced with opal_atomic_fetch_add_32()
 * and the result is reduced modulo mca_btl_ugni_component.virtual_device_count.
 *
 * @param[in] ugni_module  uGNI BTL module (not read by this function)
 *
 * @returns a device index in the range [0, virtual_device_count)
 *
 * NOTE(review): assumes virtual_device_count is non-zero when this is called;
 * the MCA variable is registered with a default of 0 ("auto"), so confirm it
 * is resolved to a real count before any caller reaches this point.
 */
static inline uint32_t mca_btl_ugni_ep_get_device_index (mca_btl_ugni_module_t *ugni_module)
|
||||
{
|
||||
static volatile uint32_t device_index = (uint32_t) 0;
|
||||
|
||||
/* advance the shared counter with an atomic fetch-and-add, then wrap it to the device count */
|
||||
return opal_atomic_fetch_add_32 (&device_index, 1) % mca_btl_ugni_component.virtual_device_count;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a virtual device for communication
|
||||
*/
|
||||
static inline mca_btl_ugni_device_t *mca_btl_ugni_ep_get_device (mca_btl_ugni_module_t *ugni_module)
|
||||
{
|
||||
static volatile uint32_t device_index = (uint32_t) 0;
|
||||
uint32_t dev_index;
|
||||
|
||||
/* don't really care if the device index is atomically updated */
|
||||
dev_index = (device_index++) & (mca_btl_ugni_component.virtual_device_count - 1);
|
||||
|
||||
return ugni_module->devices + dev_index;
|
||||
return ugni_module->devices + mca_btl_ugni_ep_get_device_index (ugni_module);
|
||||
}
|
||||
|
||||
static inline int mca_btl_rc_ugni_to_opal (gni_return_t rc)
|
||||
@ -322,6 +336,9 @@ static inline int mca_btl_rc_ugni_to_opal (gni_return_t rc)
|
||||
return codes[rc];
|
||||
}
|
||||
|
||||
|
||||
int mca_btl_ugni_flush (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint);
|
||||
|
||||
/**
|
||||
* BML->BTL notification of change in the process list.
|
||||
*
|
||||
@ -481,6 +498,16 @@ static inline uint64_t mca_btl_ugni_proc_name_to_id (opal_process_name_t name) {
|
||||
int mca_btl_ugni_spawn_progress_thread(struct mca_btl_base_module_t* btl);
|
||||
int mca_btl_ugni_kill_progress_thread(void);
|
||||
|
||||
struct mca_btl_ugni_post_descriptor_t;
|
||||
|
||||
void btl_ugni_dump_post_desc (struct mca_btl_ugni_post_descriptor_t *desc);
|
||||
|
||||
|
||||
struct mca_btl_ugni_post_descriptor_t;
|
||||
|
||||
void mca_btl_ugni_handle_rdma_completions (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device,
|
||||
struct mca_btl_ugni_post_descriptor_t *post_desc, const int count);
|
||||
|
||||
/**
|
||||
* Try to lock a uGNI device for exclusive access
|
||||
*/
|
||||
@ -531,6 +558,58 @@ static inline intptr_t mca_btl_ugni_device_serialize (mca_btl_ugni_device_t *dev
|
||||
return rc;
|
||||
}
|
||||
|
||||
/**
 * Run a callback on some virtual device of the module while holding that
 * device's lock.
 *
 * Device selection:
 *  - If opal_using_threads() is false, fn is called on the first device
 *    (ugni_module->devices) without taking any lock.
 *  - If OPAL_C_HAVE__THREAD_LOCAL is set and bind_threads_to_devices is true,
 *    the calling thread's device is cached in a _Thread_local pointer the
 *    first time through and reused on every later call.
 *  - Otherwise each device is tried in turn with
 *    mca_btl_ugni_device_trylock(), starting at a round-robin index; if none
 *    can be acquired the function blocks on the device returned by
 *    mca_btl_ugni_ep_get_device().
 *
 * @param[in] ugni_module  uGNI BTL module whose devices are considered
 * @param[in] fn           callback invoked as fn (device, arg)
 * @param[in] arg          opaque argument handed to fn unchanged
 *
 * @returns whatever fn returns
 */
static inline intptr_t mca_btl_ugni_device_serialize_any (mca_btl_ugni_module_t *ugni_module,
|
||||
mca_btl_ugni_device_serialize_fn_t fn, void *arg)
|
||||
{
|
||||
mca_btl_ugni_device_t *device;
|
||||
intptr_t rc;
|
||||
|
||||
/* single-threaded case: call fn directly on the first device, no locking */
if (!opal_using_threads ()) {
|
||||
return fn (ugni_module->devices, arg);
|
||||
}
|
||||
|
||||
#if OPAL_C_HAVE__THREAD_LOCAL
|
||||
if (mca_btl_ugni_component.bind_threads_to_devices) {
|
||||
/* NTH: if we have C11 _Thread_local just go ahead and assign the devices round-robin to each
|
||||
* thread. in testing this gives much better performance than just picking any device */
|
||||
static _Thread_local mca_btl_ugni_device_t *device_local = NULL;
|
||||
|
||||
device = device_local;
|
||||
if (OPAL_UNLIKELY(NULL == device)) {
|
||||
/* assign device contexts round-robin */
|
||||
device_local = device = mca_btl_ugni_ep_get_device (ugni_module);
|
||||
}
|
||||
|
||||
mca_btl_ugni_device_lock (device);
|
||||
} else {
|
||||
#endif
|
||||
/* get the next starting index */
|
||||
uint32_t device_index = mca_btl_ugni_ep_get_device_index (ugni_module);
|
||||
const int device_count = mca_btl_ugni_component.virtual_device_count;
|
||||
|
||||
/* scan every device once, starting at device_index; device ends up NULL if no trylock broke out of the loop */
for (int i = 0 ; i < device_count ; ++i) {
|
||||
device = ugni_module->devices + ((device_index + i) % device_count);
|
||||
/* NOTE(review): presumably trylock returns 0 when the lock was acquired -- confirm against its definition */
if (!mca_btl_ugni_device_trylock (device)) {
|
||||
break;
|
||||
}
|
||||
|
||||
device = NULL;
|
||||
}
|
||||
|
||||
/* nothing could be acquired without blocking: pick a device via mca_btl_ugni_ep_get_device() and wait for its lock */
if (NULL == device) {
|
||||
device = mca_btl_ugni_ep_get_device (ugni_module);
|
||||
mca_btl_ugni_device_lock (device);
|
||||
}
|
||||
#if OPAL_C_HAVE__THREAD_LOCAL
|
||||
}
|
||||
#endif
|
||||
|
||||
/* the selected device is locked on every path that reaches this point */
rc = fn (device, arg);
|
||||
mca_btl_ugni_device_unlock (device);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/** Number of times the progress thread has woken up */
|
||||
extern unsigned int mca_btl_ugni_progress_thread_wakeups;
|
||||
|
@ -79,7 +79,7 @@ int mca_btl_ugni_add_procs (struct mca_btl_base_module_t* btl, size_t nprocs,
|
||||
if (false == ugni_module->initialized) {
|
||||
for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) {
|
||||
mca_btl_ugni_device_t *device = ugni_module->devices + i;
|
||||
rc = GNI_CqCreate (device->dev_handle, mca_btl_ugni_component.local_cq_size, 0,
|
||||
rc = GNI_CqCreate (device->dev_handle, mca_btl_ugni_component.local_rdma_cq_size, 0,
|
||||
GNI_CQ_NOBLOCK, NULL, NULL, &device->dev_rdma_local_cq.gni_handle);
|
||||
if (GNI_RC_SUCCESS != rc) {
|
||||
BTL_ERROR(("error creating local BTE/FMA CQ"));
|
||||
@ -94,7 +94,7 @@ int mca_btl_ugni_add_procs (struct mca_btl_base_module_t* btl, size_t nprocs,
|
||||
}
|
||||
|
||||
if (mca_btl_ugni_component.progress_thread_enabled) {
|
||||
rc = GNI_CqCreate (device->dev_handle, mca_btl_ugni_component.local_cq_size,
|
||||
rc = GNI_CqCreate (device->dev_handle, mca_btl_ugni_component.local_rdma_cq_size,
|
||||
0, GNI_CQ_BLOCKING, NULL, NULL, &device->dev_rdma_local_irq_cq.gni_handle);
|
||||
if (GNI_RC_SUCCESS != rc) {
|
||||
BTL_ERROR(("error creating local BTE/FMA CQ"));
|
||||
@ -448,15 +448,6 @@ mca_btl_ugni_setup_mpools (mca_btl_ugni_module_t *ugni_module)
|
||||
return rc;
|
||||
}
|
||||
|
||||
rc = opal_free_list_init (&ugni_module->post_descriptors,
|
||||
sizeof (mca_btl_ugni_post_descriptor_t),
|
||||
8, OBJ_CLASS(mca_btl_ugni_post_descriptor_t),
|
||||
0, 0, 0, -1, 256, NULL, 0, NULL, NULL, NULL);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
BTL_ERROR(("error creating post descriptor free list"));
|
||||
return rc;
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2014-2017 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -79,8 +79,8 @@ int mca_btl_ugni_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end
|
||||
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
{
|
||||
gni_mem_handle_t dummy = {0, 0};
|
||||
mca_btl_ugni_post_descriptor_t *post_desc;
|
||||
int gni_op, rc, type;
|
||||
mca_btl_ugni_post_descriptor_t post_desc;
|
||||
int gni_op, type;
|
||||
size_t size;
|
||||
|
||||
size = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 4 : 8;
|
||||
@ -95,23 +95,13 @@ int mca_btl_ugni_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end
|
||||
return OPAL_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
post_desc = mca_btl_ugni_alloc_post_descriptor (endpoint, NULL, cbfunc, cbcontext, cbdata);
|
||||
if (OPAL_UNLIKELY(NULL == post_desc)) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
init_post_desc (&post_desc, endpoint, order, GNI_POST_AMO, 0, dummy, remote_address,
|
||||
remote_handle->gni_handle, size, 0, cbfunc, cbcontext, cbdata,
|
||||
NULL);
|
||||
post_desc.gni_desc.amo_cmd = gni_op;
|
||||
post_desc.gni_desc.first_operand = operand;
|
||||
|
||||
init_gni_post_desc (post_desc, order, GNI_POST_AMO, 0, dummy, remote_address,
|
||||
remote_handle->gni_handle, size, 0);
|
||||
post_desc->desc.amo_cmd = gni_op;
|
||||
|
||||
post_desc->desc.first_operand = operand;
|
||||
|
||||
rc = mca_btl_ugni_endpoint_post_fma (endpoint, post_desc);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
mca_btl_ugni_return_post_descriptor (post_desc);
|
||||
}
|
||||
|
||||
return rc;
|
||||
return mca_btl_ugni_endpoint_post_fma (endpoint, &post_desc);
|
||||
}
|
||||
|
||||
int mca_btl_ugni_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
@ -120,8 +110,8 @@ int mca_btl_ugni_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_en
|
||||
uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata)
|
||||
{
|
||||
mca_btl_ugni_post_descriptor_t *post_desc;
|
||||
int gni_op, rc, type;
|
||||
mca_btl_ugni_post_descriptor_t post_desc;
|
||||
int gni_op, type;
|
||||
size_t size;
|
||||
|
||||
size = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 4 : 8;
|
||||
@ -136,24 +126,13 @@ int mca_btl_ugni_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_en
|
||||
return OPAL_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
post_desc = mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata);
|
||||
if (OPAL_UNLIKELY(NULL == post_desc)) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
init_post_desc (&post_desc, endpoint, order, GNI_POST_AMO, (intptr_t) local_address,
|
||||
local_handle->gni_handle, remote_address, remote_handle->gni_handle,
|
||||
size, 0, cbfunc, cbcontext, cbdata, local_handle);
|
||||
post_desc.gni_desc.amo_cmd = gni_op;
|
||||
post_desc.gni_desc.first_operand = operand;
|
||||
|
||||
|
||||
init_gni_post_desc (post_desc, order, GNI_POST_AMO, (intptr_t) local_address, local_handle->gni_handle,
|
||||
remote_address, remote_handle->gni_handle, size, 0);
|
||||
post_desc->desc.amo_cmd = gni_op;
|
||||
|
||||
post_desc->desc.first_operand = operand;
|
||||
|
||||
rc = mca_btl_ugni_endpoint_post_fma (endpoint, post_desc);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
mca_btl_ugni_return_post_descriptor (post_desc);
|
||||
}
|
||||
|
||||
return rc;
|
||||
return mca_btl_ugni_endpoint_post_fma (endpoint, &post_desc);
|
||||
}
|
||||
|
||||
int mca_btl_ugni_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
@ -161,30 +140,19 @@ int mca_btl_ugni_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_
|
||||
mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
{
|
||||
mca_btl_ugni_post_descriptor_t *post_desc;
|
||||
int gni_op, rc;
|
||||
mca_btl_ugni_post_descriptor_t post_desc;
|
||||
size_t size;
|
||||
int gni_op;
|
||||
|
||||
gni_op = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? GNI_FMA_ATOMIC2_CSWAP_S : GNI_FMA_ATOMIC_CSWAP;
|
||||
size = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 4 : 8;
|
||||
|
||||
post_desc = mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata);
|
||||
if (OPAL_UNLIKELY(NULL == post_desc)) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
init_post_desc (&post_desc, endpoint, order, GNI_POST_AMO, (intptr_t) local_address,
|
||||
local_handle->gni_handle, remote_address, remote_handle->gni_handle, size, 0,
|
||||
cbfunc, cbcontext, cbdata, local_handle);
|
||||
post_desc.gni_desc.amo_cmd = gni_op;
|
||||
post_desc.gni_desc.first_operand = compare;
|
||||
post_desc.gni_desc.second_operand = value;
|
||||
|
||||
|
||||
init_gni_post_desc (post_desc, order, GNI_POST_AMO, (intptr_t) local_address, local_handle->gni_handle,
|
||||
remote_address, remote_handle->gni_handle, size, 0);
|
||||
post_desc->desc.amo_cmd = gni_op;
|
||||
|
||||
post_desc->desc.first_operand = compare;
|
||||
post_desc->desc.second_operand = value;
|
||||
|
||||
rc = mca_btl_ugni_endpoint_post_fma (endpoint, post_desc);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
mca_btl_ugni_return_post_descriptor (post_desc);
|
||||
}
|
||||
|
||||
return rc;
|
||||
return mca_btl_ugni_endpoint_post_fma (endpoint, &post_desc);
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
@ -155,14 +155,23 @@ static int btl_ugni_component_register(void)
|
||||
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
||||
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
|
||||
&mca_btl_ugni_component.remote_cq_size);
|
||||
|
||||
mca_btl_ugni_component.local_cq_size = 8192;
|
||||
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
||||
"local_cq_size", "Local completion queue size "
|
||||
"(default 8192)", MCA_BASE_VAR_TYPE_INT,
|
||||
"local_cq_size", "Local SMSG completion queue size "
|
||||
"(default 8k)", MCA_BASE_VAR_TYPE_INT,
|
||||
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
||||
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
|
||||
&mca_btl_ugni_component.local_cq_size);
|
||||
|
||||
mca_btl_ugni_component.local_rdma_cq_size = 1024;
|
||||
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
||||
"local_rdma_cq_size", "Local FMA/RDMA completion queue size "
|
||||
"(default: 1024)",MCA_BASE_VAR_TYPE_INT, NULL, 0,
|
||||
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_LOCAL,
|
||||
&mca_btl_ugni_component.local_rdma_cq_size);
|
||||
|
||||
mca_btl_ugni_component.ugni_smsg_limit = 0;
|
||||
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
||||
"smsg_limit", "Maximum size message that "
|
||||
@ -182,16 +191,51 @@ static int btl_ugni_component_register(void)
|
||||
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
|
||||
&mca_btl_ugni_component.smsg_max_credits);
|
||||
|
||||
mca_btl_ugni_component.ugni_fma_limit = 1024;
|
||||
#if OPAL_C_HAVE__THREAD_LOCAL
|
||||
mca_btl_ugni_component.bind_threads_to_devices = true;
|
||||
|
||||
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
||||
"fma_limit", "Maximum size message that "
|
||||
"will be sent using the FMA (Fast Memory "
|
||||
"Access) protocol (default 1024, 64k max)",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0,
|
||||
"bind_devices", "Bind threads to virtual "
|
||||
"devices. In general this should improve "
|
||||
"RDMA performance (default: true)",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
|
||||
MCA_BASE_VAR_FLAG_SETTABLE,
|
||||
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
|
||||
&mca_btl_ugni_component.bind_threads_to_devices);
|
||||
#endif
|
||||
|
||||
mca_btl_ugni_component.ugni_fma_limit = -1;
|
||||
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
||||
"fma_limit", "Default maximum size message that "
|
||||
"will be sent using the FMA (Fast Memory "
|
||||
"Access) protocol (default: -1 (don't use), 64k max)",
|
||||
MCA_BASE_VAR_TYPE_LONG, NULL, 0,
|
||||
MCA_BASE_VAR_FLAG_SETTABLE | MCA_BASE_VAR_FLAG_DEPRECATED,
|
||||
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
|
||||
&mca_btl_ugni_component.ugni_fma_limit);
|
||||
|
||||
mca_btl_ugni_component.ugni_fma_get_limit = 2048;
|
||||
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
||||
"fma_get_limit", "Maximum size message that "
|
||||
"will be sent using the FMA (Fast Memory "
|
||||
"Access) protocol for get (default 2k, "
|
||||
"64k max)",
|
||||
MCA_BASE_VAR_TYPE_LONG, NULL, 0,
|
||||
MCA_BASE_VAR_FLAG_SETTABLE,
|
||||
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
|
||||
&mca_btl_ugni_component.ugni_fma_get_limit);
|
||||
|
||||
mca_btl_ugni_component.ugni_fma_put_limit = 4096;
|
||||
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
||||
"fma_put_limit", "Maximum size message that "
|
||||
"will be sent using the FMA (Fast Memory "
|
||||
"Access) protocol for put (default: 4k, "
|
||||
"64k max)",
|
||||
MCA_BASE_VAR_TYPE_LONG, NULL, 0,
|
||||
MCA_BASE_VAR_FLAG_SETTABLE,
|
||||
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
|
||||
&mca_btl_ugni_component.ugni_fma_put_limit);
|
||||
|
||||
mca_btl_ugni_component.rdma_max_retries = 16;
|
||||
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
||||
"rdma_max_retries", NULL, MCA_BASE_VAR_TYPE_INT,
|
||||
@ -232,9 +276,9 @@ static int btl_ugni_component_register(void)
|
||||
|
||||
mca_btl_ugni_component.cdm_flags = GNI_CDM_MODE_FORK_PARTCOPY | GNI_CDM_MODE_ERR_NO_KILL | GNI_CDM_MODE_FAST_DATAGRAM_POLL |
|
||||
GNI_CDM_MODE_MDD_SHARED | GNI_CDM_MODE_FMA_SHARED | GNI_CDM_MODE_FMA_SMALL_WINDOW;
|
||||
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
||||
mca_btl_ugni_component.cdm_flags_id = mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
||||
"cdm_flags", "Flags to set when creating a communication domain "
|
||||
" (default: fork-fullcopy,cached-amo-enabled,err-no-kill,fast-datagram-poll,"
|
||||
" (default: fork-full-copy,cached-amo-enabled,err-no-kill,fast-datagram-poll,"
|
||||
"fma-shared,fma-small-window)",
|
||||
MCA_BASE_VAR_TYPE_UNSIGNED_INT, new_enum, 0,
|
||||
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
|
||||
@ -244,7 +288,7 @@ static int btl_ugni_component_register(void)
|
||||
mca_btl_ugni_component.virtual_device_count = 0;
|
||||
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
||||
"virtual_device_count", "Number of virtual devices to create. Higher numbers may "
|
||||
"result in better performance when using threads. (default: auto, max: 8)",
|
||||
"result in better performance when using threads. (default: 0 (auto), max: 128)",
|
||||
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0,
|
||||
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
|
||||
MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.virtual_device_count);
|
||||
@ -392,7 +436,7 @@ static int btl_ugni_component_register(void)
|
||||
mca_btl_ugni_module.super.btl_latency = 2; /* Microsecs */
|
||||
|
||||
mca_btl_ugni_module.super.btl_get_local_registration_threshold = 0;
|
||||
mca_btl_ugni_module.super.btl_put_local_registration_threshold = mca_btl_ugni_component.ugni_fma_limit;
|
||||
mca_btl_ugni_module.super.btl_put_local_registration_threshold = mca_btl_ugni_component.ugni_fma_put_limit;
|
||||
|
||||
/* Call the BTL based to register its MCA params */
|
||||
mca_btl_base_param_register(&mca_btl_ugni_component.super.btl_version,
|
||||
@ -418,10 +462,8 @@ btl_ugni_component_close(void)
|
||||
{
|
||||
mca_btl_ugni_fini ();
|
||||
|
||||
if (mca_btl_ugni_component.modules) {
|
||||
free (mca_btl_ugni_component.modules);
|
||||
mca_btl_ugni_component.modules = NULL;
|
||||
}
|
||||
free (mca_btl_ugni_component.modules);
|
||||
mca_btl_ugni_component.modules = NULL;
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
@ -443,7 +485,22 @@ mca_btl_ugni_component_init (int *num_btl_modules,
|
||||
mca_btl_ugni_component.ugni_fma_limit = 65536;
|
||||
}
|
||||
|
||||
mca_btl_ugni_module.super.btl_put_local_registration_threshold = mca_btl_ugni_component.ugni_fma_limit;
|
||||
if (-1 != mca_btl_ugni_component.ugni_fma_limit) {
|
||||
mca_btl_ugni_component.ugni_fma_get_limit = mca_btl_ugni_component.ugni_fma_limit;
|
||||
} else if (65536 < mca_btl_ugni_component.ugni_fma_get_limit) {
|
||||
mca_btl_ugni_component.ugni_fma_get_limit = 65536;
|
||||
}
|
||||
|
||||
if (-1 != mca_btl_ugni_component.ugni_fma_limit) {
|
||||
mca_btl_ugni_component.ugni_fma_put_limit = mca_btl_ugni_component.ugni_fma_limit;
|
||||
} else if (65536 < mca_btl_ugni_component.ugni_fma_put_limit) {
|
||||
mca_btl_ugni_component.ugni_fma_put_limit = 65536;
|
||||
}
|
||||
|
||||
mca_btl_ugni_module.super.btl_put_local_registration_threshold = mca_btl_ugni_component.ugni_fma_put_limit;
|
||||
|
||||
/* limit the number of outstanding RDMA operations over all devices */
|
||||
mca_btl_ugni_component.active_rdma_threshold = mca_btl_ugni_component.local_rdma_cq_size;
|
||||
|
||||
if (enable_mpi_threads && mca_btl_ugni_component.progress_thread_requested) {
|
||||
mca_btl_ugni_component.progress_thread_enabled = 1;
|
||||
@ -562,108 +619,43 @@ int mca_btl_ugni_progress_datagram (mca_btl_ugni_device_t *device)
|
||||
return count;
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_DEBUG
|
||||
static inline void btl_ugni_dump_post_desc (mca_btl_ugni_post_descriptor_t *desc)
|
||||
void mca_btl_ugni_handle_rdma_completions (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device,
|
||||
struct mca_btl_ugni_post_descriptor_t *post_desc, const int count)
|
||||
{
|
||||
int bte_complete = 0;
|
||||
|
||||
fprintf (stderr, "desc->desc.post_id = %" PRIx64 "\n", desc->desc.post_id);
|
||||
fprintf (stderr, "desc->desc.status = %" PRIx64 "\n", desc->desc.status);
|
||||
fprintf (stderr, "desc->desc.cq_mode_complete = %hu\n", desc->desc.cq_mode_complete);
|
||||
fprintf (stderr, "desc->desc.type = %d\n", desc->desc.type);
|
||||
fprintf (stderr, "desc->desc.cq_mode = %hu\n", desc->desc.cq_mode);
|
||||
fprintf (stderr, "desc->desc.dlvr_mode = %hu\n", desc->desc.dlvr_mode);
|
||||
fprintf (stderr, "desc->desc.local_addr = %" PRIx64 "\n", desc->desc.local_addr);
|
||||
fprintf (stderr, "desc->desc.local_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n", desc->desc.local_mem_hndl.qword1,
|
||||
desc->desc.local_mem_hndl.qword2);
|
||||
fprintf (stderr, "desc->desc.remote_addr = %" PRIx64 "\n", desc->desc.remote_addr);
|
||||
fprintf (stderr, "desc->desc.remote_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n", desc->desc.remote_mem_hndl.qword1,
|
||||
desc->desc.remote_mem_hndl.qword2);
|
||||
fprintf (stderr, "desc->desc.length = %" PRIu64 "\n", desc->desc.length);
|
||||
fprintf (stderr, "desc->desc.rdma_mode = %hu\n", desc->desc.rdma_mode);
|
||||
fprintf (stderr, "desc->desc.amo_cmd = %d\n", desc->desc.amo_cmd);
|
||||
}
|
||||
#endif
|
||||
for (int i = 0 ; i < count ; ++i) {
|
||||
BTL_VERBOSE(("post descriptor complete. status: %d", post_dest[i].rc));
|
||||
|
||||
static inline int
|
||||
mca_btl_ugni_post_pending (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device)
|
||||
{
|
||||
int pending_post_count = opal_list_get_size (&device->pending_post);
|
||||
mca_btl_ugni_post_descriptor_t *post_desc;
|
||||
int rc;
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != post_desc[i].rc)) {
|
||||
/* dump the post descriptor if in a debug build */
|
||||
btl_ugni_dump_post_desc (post_desc + i);
|
||||
}
|
||||
|
||||
/* check if there are any posts pending resources */
|
||||
if (OPAL_LIKELY(0 == pending_post_count)) {
|
||||
return 0;
|
||||
bte_complete += post_desc[i].use_bte == true;
|
||||
|
||||
mca_btl_ugni_post_desc_complete (ugni_module, post_desc + i, post_desc[i].rc);
|
||||
}
|
||||
|
||||
BTL_VERBOSE(("progressing %d pending FMA/RDMA operations", pending_post_count));
|
||||
for (int i = 0 ; i < pending_post_count ; ++i) {
|
||||
mca_btl_ugni_device_lock (device);
|
||||
post_desc = (mca_btl_ugni_post_descriptor_t *) opal_list_remove_first (&device->pending_post);
|
||||
mca_btl_ugni_device_unlock (device);
|
||||
if (NULL == post_desc) {
|
||||
break;
|
||||
}
|
||||
rc = mca_btl_ugni_repost (ugni_module, post_desc);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
mca_btl_ugni_device_lock (device);
|
||||
opal_list_prepend (&device->pending_post, (opal_list_item_t *) post_desc);
|
||||
mca_btl_ugni_device_unlock (device);
|
||||
break;
|
||||
}
|
||||
if (bte_complete > 0) {
|
||||
(void) OPAL_THREAD_FETCH_ADD32 (&ugni_module->active_rdma_count, -bte_complete);
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static inline int mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device,
|
||||
mca_btl_ugni_cq_t *cq)
|
||||
{
|
||||
mca_btl_ugni_post_descriptor_t *post_desc[MCA_BTL_UGNI_COMPLETIONS_PER_LOOP];
|
||||
gni_cq_entry_t event_data[MCA_BTL_UGNI_COMPLETIONS_PER_LOOP];
|
||||
mca_btl_ugni_post_descriptor_t post_desc[MCA_BTL_UGNI_COMPLETIONS_PER_LOOP];
|
||||
int rc;
|
||||
|
||||
rc = mca_btl_ugni_cq_get_completed_desc (device, cq, event_data, post_desc, MCA_BTL_UGNI_COMPLETIONS_PER_LOOP);
|
||||
rc = mca_btl_ugni_cq_get_completed_desc (device, cq, post_desc, MCA_BTL_UGNI_COMPLETIONS_PER_LOOP);
|
||||
if (0 >= rc) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
BTL_VERBOSE(("got %d completed rdma descriptors", rc));
|
||||
|
||||
for (int i = 0 ; i < rc ; ++i) {
|
||||
BTL_VERBOSE(("post descriptor %p complete. GNI_CQ_STATUS_OK(): %d", (void*)post_desc[i],
|
||||
GNI_CQ_STATUS_OK(event_data[i])));
|
||||
|
||||
if (OPAL_UNLIKELY(!GNI_CQ_STATUS_OK(event_data[i]))) {
|
||||
uint32_t recoverable = 1;
|
||||
|
||||
(void) GNI_CqErrorRecoverable (event_data[i], &recoverable);
|
||||
|
||||
if (OPAL_UNLIKELY(++post_desc[i]->tries >= mca_btl_ugni_component.rdma_max_retries ||
|
||||
!recoverable)) {
|
||||
char char_buffer[1024];
|
||||
GNI_CqErrorStr (event_data[i], char_buffer, 1024);
|
||||
/* give up */
|
||||
BTL_ERROR(("giving up on desciptor %p, recoverable %d: %s", (void *) post_desc[i],
|
||||
recoverable, char_buffer));
|
||||
#if OPAL_ENABLE_DEBUG
|
||||
btl_ugni_dump_post_desc (post_desc[i]);
|
||||
#endif
|
||||
mca_btl_ugni_post_desc_complete (ugni_module, post_desc[i], OPAL_ERROR);
|
||||
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
mca_btl_ugni_repost (ugni_module, post_desc[i]);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
mca_btl_ugni_post_desc_complete (ugni_module, post_desc[i], OPAL_SUCCESS);
|
||||
}
|
||||
|
||||
/* should be resources to progress the pending post list */
|
||||
(void) mca_btl_ugni_post_pending (ugni_module, device);
|
||||
mca_btl_ugni_handle_rdma_completions (ugni_module, device, post_desc, rc);
|
||||
|
||||
return rc;
|
||||
}
|
||||
@ -734,3 +726,44 @@ static int mca_btl_ugni_component_progress (void)
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
/**
 * Wait for all outstanding local RDMA/FMA operations to complete.
 *
 * For every virtual device of the component's module, repeatedly calls
 * mca_btl_ugni_progress_rdma() on the device's local RDMA completion queue
 * until that queue's active_operations counter reads zero, then sets the
 * device's flushed flag.
 *
 * @param[in] btl       BTL module (not read; the module is taken from
 *                      mca_btl_ugni_component.modules instead)
 * @param[in] endpoint  endpoint (not read; every device is flushed)
 *
 * @returns OPAL_SUCCESS in all cases; the return value of
 *          mca_btl_ugni_progress_rdma() is discarded
 */
int mca_btl_ugni_flush (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint)
|
||||
{
|
||||
mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_component.modules;
|
||||
|
||||
for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) {
|
||||
mca_btl_ugni_device_t *device = ugni_module->devices + i;
|
||||
/* spin on progress until all active operations are complete. it is tempting to
|
||||
* take an initial count then wait until that many operations have been completed
|
||||
* but it is impossible to tell if those are the operations the caller is waiting
|
||||
* on. */
|
||||
while (device->dev_rdma_local_cq.active_operations) {
|
||||
(void) mca_btl_ugni_progress_rdma (ugni_module, device, &device->dev_rdma_local_cq);
|
||||
}
|
||||
|
||||
/* mark that the device was recently flushed */
|
||||
device->flushed = true;
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * @brief Print the uGNI portion of a post descriptor to stderr
 *
 * @param[in] desc  post descriptor whose gni_desc member is dumped
 *
 * Debugging aid: writes one line per field (two values for each memory
 * handle). Nothing is modified.
 */
void btl_ugni_dump_post_desc (mca_btl_ugni_post_descriptor_t *desc)
{
    const gni_post_descriptor_t *gni = &desc->gni_desc;
    FILE *out = stderr;

    /* identification and completion state */
    fprintf (out, "desc->gni_desc.post_id = %" PRIx64 "\n", gni->post_id);
    fprintf (out, "desc->gni_desc.status = %" PRIx64 "\n", gni->status);
    fprintf (out, "desc->gni_desc.cq_mode_complete = %hu\n", gni->cq_mode_complete);

    /* operation type and delivery modes */
    fprintf (out, "desc->gni_desc.type = %d\n", gni->type);
    fprintf (out, "desc->gni_desc.cq_mode = %hu\n", gni->cq_mode);
    fprintf (out, "desc->gni_desc.dlvr_mode = %hu\n", gni->dlvr_mode);

    /* local side of the transfer */
    fprintf (out, "desc->gni_desc.local_addr = %" PRIx64 "\n", gni->local_addr);
    fprintf (out, "desc->gni_desc.local_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n",
             gni->local_mem_hndl.qword1, gni->local_mem_hndl.qword2);

    /* remote side of the transfer */
    fprintf (out, "desc->gni_desc.remote_addr = %" PRIx64 "\n", gni->remote_addr);
    fprintf (out, "desc->gni_desc.remote_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n",
             gni->remote_mem_hndl.qword1, gni->remote_mem_hndl.qword2);

    /* size and remaining mode fields */
    fprintf (out, "desc->gni_desc.length = %" PRIu64 "\n", gni->length);
    fprintf (out, "desc->gni_desc.rdma_mode = %hu\n", gni->rdma_mode);
    fprintf (out, "desc->gni_desc.amo_cmd = %d\n", gni->amo_cmd);
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
@ -26,6 +26,31 @@
|
||||
#include "btl_ugni_frag.h"
|
||||
|
||||
/* helper functions */
|
||||
/**
|
||||
* @brief Output an error message on CQ or completion error.
|
||||
*
|
||||
* @param[in] grc GNI error from GNI_CqGetEvent or GNI_GetCompleted
|
||||
* @param[in] event_data event data from GNI_CqGetEvent
|
||||
*
|
||||
* This is a small function to print out an error if an error
|
||||
* was detected on a CQ event.
|
||||
*/
|
||||
int mca_btl_ugni_event_fatal_error (gni_return_t grc, gni_cq_entry_t event_data);
|
||||
|
||||
/**
|
||||
* @brief Attempt to re-post an rdma descriptor
|
||||
*
|
||||
* @param[in] rdma_desc RDMA descriptor that failed
|
||||
* @param[in] event_data CQ event data
|
||||
*
|
||||
* @returns OPAL_SUCCESS if the descriptor was re-posted
|
||||
* @returns OPAL_ERROR otherwise
|
||||
*
|
||||
* This function checks if the error is recoverable and re-posts the
|
||||
* descriptor if possible. The device lock MUST be held when this
|
||||
* function is called.
|
||||
*/
|
||||
int mca_btl_ugni_device_handle_event_error (struct mca_btl_ugni_rdma_desc_t *rdma_desc, gni_cq_entry_t event_data);
|
||||
|
||||
typedef struct mca_btl_ugni_smsg_send_wtag_arg_t {
|
||||
gni_ep_handle_t ep_handle;
|
||||
@ -67,19 +92,6 @@ static inline intptr_t mca_btl_ugni_smsg_release_device (mca_btl_ugni_device_t *
|
||||
return GNI_SmsgRelease (ep_handle->gni_handle);
|
||||
}
|
||||
|
||||
/**
 * @brief Discard every event currently available on a completion queue
 *
 * @param[in] device  device the call is serialized on (not used by the body)
 * @param[in] arg     the gni_cq_handle_t to drain, passed through a void pointer
 *
 * @returns OPAL_SUCCESS always
 *
 * Pulls events with GNI_CqGetEvent and throws them away until the call
 * reports GNI_RC_NOT_DONE. Any other return code keeps the loop going.
 */
static inline intptr_t mca_btl_ugni_cq_clear_device (mca_btl_ugni_device_t *device, void *arg)
{
    gni_cq_handle_t cq_handle = (gni_cq_handle_t) (intptr_t) arg;
    gni_cq_entry_t discarded;

    for (;;) {
        int grc = GNI_CqGetEvent (cq_handle, &discarded);

        if (GNI_RC_NOT_DONE == grc) {
            /* nothing left to read */
            break;
        }
    }

    return OPAL_SUCCESS;
}
|
||||
|
||||
typedef struct mca_btl_ugni_cq_get_event_args_t {
|
||||
mca_btl_ugni_cq_t *cq;
|
||||
gni_cq_entry_t *event_data;
|
||||
@ -91,10 +103,23 @@ static inline intptr_t mca_btl_ugni_cq_get_event_device (mca_btl_ugni_device_t *
|
||||
gni_return_t rc;
|
||||
|
||||
rc = GNI_CqGetEvent (args->cq->gni_handle, args->event_data);
|
||||
args->cq->active_operations -= GNI_RC_NOT_DONE != rc;
|
||||
args->cq->active_operations -= (GNI_RC_NOT_DONE != rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/**
 * @brief Discard every event currently available on a completion queue
 *
 * @param[in] device  device the call is serialized on (not used by the body)
 * @param[in] arg     the gni_cq_handle_t to drain, passed through a void pointer
 *
 * @returns OPAL_SUCCESS always
 *
 * Pulls events with GNI_CqGetEvent and throws them away until the call
 * reports GNI_RC_NOT_DONE. Any other return code keeps the loop going.
 */
static inline intptr_t mca_btl_ugni_cq_clear_device (mca_btl_ugni_device_t *device, void *arg)
{
    gni_cq_handle_t cq_handle = (gni_cq_handle_t) (intptr_t) arg;
    gni_cq_entry_t discarded;

    for (;;) {
        int grc = GNI_CqGetEvent (cq_handle, &discarded);

        if (GNI_RC_NOT_DONE == grc) {
            /* nothing left to read */
            break;
        }
    }

    return OPAL_SUCCESS;
}
|
||||
|
||||
typedef struct mca_btl_ugni_gni_cq_get_event_args_t {
|
||||
gni_cq_handle_t cq;
|
||||
gni_cq_entry_t *event_data;
|
||||
@ -107,148 +132,247 @@ static inline intptr_t mca_btl_ugni_gni_cq_get_event_device (mca_btl_ugni_device
|
||||
return GNI_CqGetEvent (args->cq, args->event_data);
|
||||
}
|
||||
|
||||
static inline intptr_t mca_btl_ugni_post_fma_device (mca_btl_ugni_device_t *device, void *arg)
|
||||
typedef struct mca_btl_ugni_cq_get_completed_desc_arg_t {
|
||||
mca_btl_ugni_cq_t *cq;
|
||||
mca_btl_ugni_post_descriptor_t *post_desc;
|
||||
int count;
|
||||
} mca_btl_ugni_cq_get_completed_desc_arg_t;
|
||||
|
||||
__opal_attribute_always_inline__
|
||||
static inline int _mca_btl_ugni_repost_rdma_desc_device (mca_btl_ugni_device_t *device, mca_btl_ugni_rdma_desc_t *rdma_desc)
|
||||
{
|
||||
mca_btl_ugni_post_descriptor_t *desc = (mca_btl_ugni_post_descriptor_t *) arg;
|
||||
bool ep_handle_allocated = false;
|
||||
mca_btl_ugni_post_descriptor_t *post_desc = &rdma_desc->btl_ugni_desc;
|
||||
int rc;
|
||||
|
||||
if (NULL == desc->ep_handle) {
|
||||
desc->ep_handle = mca_btl_ugni_ep_get_rdma (desc->endpoint, device);
|
||||
if (OPAL_UNLIKELY(NULL == desc->ep_handle)) {
|
||||
return OPAL_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
}
|
||||
ep_handle_allocated = true;
|
||||
}
|
||||
|
||||
BTL_VERBOSE(("Posting FMA descriptor %p with op_type %d, amo %d, ep_handle %p, remote_addr 0x%lx, "
|
||||
"length %lu", (void*)desc, desc->desc.type, desc->desc.amo_cmd, (void*)desc->ep_handle,
|
||||
desc->desc.remote_addr, desc->desc.length));
|
||||
|
||||
rc = GNI_PostFma (desc->ep_handle->gni_handle, &desc->desc);
|
||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) {
|
||||
if (ep_handle_allocated) {
|
||||
/* only return the endpoint handle if we allocated it. if we didn't allocate the
|
||||
* handle this call was likely made from repost() */
|
||||
mca_btl_ugni_ep_return_rdma (desc->ep_handle);
|
||||
desc->ep_handle = NULL;
|
||||
}
|
||||
if (post_desc->use_bte) {
|
||||
rc = GNI_PostRdma (rdma_desc->gni_handle, &post_desc->gni_desc);
|
||||
} else {
|
||||
++device->dev_rdma_local_cq.active_operations;
|
||||
rc = GNI_PostFma (rdma_desc->gni_handle, &post_desc->gni_desc);
|
||||
}
|
||||
|
||||
return mca_btl_rc_ugni_to_opal (rc);
|
||||
}
|
||||
|
||||
static inline intptr_t mca_btl_ugni_post_rdma_device (mca_btl_ugni_device_t *device, void *arg)
|
||||
static inline intptr_t _mca_btl_ugni_cq_get_completed_desc_device (mca_btl_ugni_device_t *device, mca_btl_ugni_cq_t *cq,
|
||||
mca_btl_ugni_post_descriptor_t *post_desc,
|
||||
const int count, bool block)
|
||||
{
|
||||
mca_btl_ugni_post_descriptor_t *desc = (mca_btl_ugni_post_descriptor_t *) arg;
|
||||
bool ep_handle_allocated = false;
|
||||
mca_btl_ugni_rdma_desc_t *rdma_desc;
|
||||
gni_post_descriptor_t *desc;
|
||||
gni_cq_entry_t event_data;
|
||||
int rc, desc_index = 0;
|
||||
|
||||
for (desc_index = 0 ; desc_index < count && cq->active_operations ; ) {
|
||||
int desc_rc = OPAL_SUCCESS;
|
||||
|
||||
rc = GNI_CqGetEvent (cq->gni_handle, &event_data);
|
||||
if (GNI_RC_NOT_DONE == rc) {
|
||||
if (block) {
|
||||
/* try again */
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
block = false;
|
||||
|
||||
rc = GNI_GetCompleted (cq->gni_handle, event_data, &desc);
|
||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc && GNI_RC_TRANSACTION_ERROR != rc)) {
|
||||
return mca_btl_ugni_event_fatal_error (rc, event_data);
|
||||
}
|
||||
|
||||
rdma_desc = MCA_BTL_UGNI_GNI_DESC_TO_RDMA_DESC(desc);
|
||||
|
||||
if (OPAL_UNLIKELY(!GNI_CQ_STATUS_OK(event_data))) {
|
||||
desc_rc = mca_btl_ugni_device_handle_event_error (rdma_desc, event_data);
|
||||
if (OPAL_LIKELY(OPAL_SUCCESS == desc_rc)) {
|
||||
/* descriptor was re-posted */
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/* copy back the descriptor only if additional processing is needed. in this case more processing
|
||||
* is needed if a user callback is specified or the bte was in use. */
|
||||
if (rdma_desc->btl_ugni_desc.cbfunc || rdma_desc->btl_ugni_desc.use_bte || OPAL_SUCCESS != desc_rc) {
|
||||
post_desc[desc_index] = rdma_desc->btl_ugni_desc;
|
||||
post_desc[desc_index++].rc = desc_rc;
|
||||
}
|
||||
|
||||
/* return the descriptor while we have the lock. this is done so we can avoid using the
|
||||
* free list atomics (as both push and pop are done with the lock) */
|
||||
mca_btl_ugni_return_rdma_desc (rdma_desc);
|
||||
--cq->active_operations;
|
||||
}
|
||||
|
||||
return desc_index;
|
||||
}
|
||||
|
||||
/**
 * @brief Serialization callback: reap completed descriptors without blocking
 *
 * @param[in]    device  device the call is serialized on
 * @param[inout] arg0    pointer to a mca_btl_ugni_cq_get_completed_desc_arg_t
 *
 * @returns whatever _mca_btl_ugni_cq_get_completed_desc_device() returns
 *
 * Unpacks the argument bundle and forwards to the worker with the block
 * flag turned off.
 */
static inline intptr_t mca_btl_ugni_cq_get_completed_desc_device (mca_btl_ugni_device_t *device, void *arg0)
{
    mca_btl_ugni_cq_get_completed_desc_arg_t *reap_args = (mca_btl_ugni_cq_get_completed_desc_arg_t *) arg0;
    const bool block = false;

    return _mca_btl_ugni_cq_get_completed_desc_device (device, reap_args->cq, reap_args->post_desc,
                                                       reap_args->count, block);
}
|
||||
|
||||
/* NTH: When posting FMA or RDMA descriptors it makes sense to try and clear out a completion
|
||||
* event after posting the descriptor. This probably gives us a couple of things:
|
||||
* 1) Good locality on the associated data structures (especially with FMA which may
|
||||
* complete fairly quickly).
|
||||
* 2) Since we are already holding the lock it could mean fewer attempts to
|
||||
* lock the device over the course of the program.
|
||||
*
|
||||
* As far as I can tell there is no reason to try and clear out more than a couple
|
||||
* completion events. The code has been written to allow us to easily modify the
|
||||
* number reaped if we determine that there is a benefit to clearing a different
|
||||
* number of events. */
|
||||
|
||||
/**
|
||||
* @brief Number of events to clear after posting a descriptor
|
||||
*/
|
||||
#define MCA_BTL_UGNI_DEVICE_REAP_COUNT 4
|
||||
|
||||
struct mca_btl_ugni_post_device_args_t {
|
||||
mca_btl_ugni_post_descriptor_t *desc;
|
||||
mca_btl_ugni_device_t *device;
|
||||
int count;
|
||||
mca_btl_ugni_post_descriptor_t completed[MCA_BTL_UGNI_DEVICE_REAP_COUNT];
|
||||
};
|
||||
|
||||
static inline mca_btl_ugni_rdma_desc_t *
|
||||
mca_btl_ugni_get_rdma_desc_device (mca_btl_ugni_device_t *device, struct mca_btl_ugni_post_device_args_t *args, bool use_bte)
|
||||
{
|
||||
mca_btl_ugni_post_descriptor_t *desc = args->desc;
|
||||
mca_btl_ugni_rdma_desc_t *rdma_desc;
|
||||
int count;
|
||||
|
||||
args->device = device;
|
||||
args->count = 0;
|
||||
|
||||
do {
|
||||
rdma_desc = mca_btl_ugni_alloc_rdma_desc (device, desc, use_bte);
|
||||
if (OPAL_LIKELY(NULL != rdma_desc)) {
|
||||
return rdma_desc;
|
||||
}
|
||||
|
||||
if (OPAL_LIKELY(NULL == rdma_desc && !args->count)) {
|
||||
args->count = _mca_btl_ugni_cq_get_completed_desc_device (device, &device->dev_rdma_local_cq,
|
||||
args->completed, MCA_BTL_UGNI_DEVICE_REAP_COUNT,
|
||||
true);
|
||||
continue;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
} while (1);
|
||||
}
|
||||
|
||||
|
||||
static inline intptr_t mca_btl_ugni_post_fma_device (mca_btl_ugni_device_t *device, void *arg)
|
||||
{
|
||||
struct mca_btl_ugni_post_device_args_t *args = (struct mca_btl_ugni_post_device_args_t *) arg;
|
||||
mca_btl_ugni_rdma_desc_t *rdma_desc;
|
||||
int rc;
|
||||
|
||||
if (NULL == desc->ep_handle) {
|
||||
desc->ep_handle = mca_btl_ugni_ep_get_rdma (desc->endpoint, device);
|
||||
if (OPAL_UNLIKELY(NULL == desc->ep_handle)) {
|
||||
return OPAL_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
}
|
||||
ep_handle_allocated = true;
|
||||
rdma_desc = mca_btl_ugni_get_rdma_desc_device (device, args, false);
|
||||
if (OPAL_UNLIKELY(NULL == rdma_desc)) {
|
||||
return OPAL_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
BTL_VERBOSE(("Posting FMA descriptor %p with op_type %d, amo %d, remote_addr 0x%lx, "
|
||||
"length %lu", (void*)desc, desc->gni_desc.type, desc->gni_desc.amo_cmd,
|
||||
desc->gni_desc.remote_addr, desc->gni_desc.length));
|
||||
|
||||
rc = GNI_PostFma (rdma_desc->gni_handle, &rdma_desc->btl_ugni_desc.gni_desc);
|
||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) {
|
||||
mca_btl_ugni_return_rdma_desc (rdma_desc);
|
||||
return mca_btl_rc_ugni_to_opal (rc);
|
||||
}
|
||||
|
||||
++device->dev_rdma_local_cq.active_operations;
|
||||
|
||||
/* to improve bandwidth and latency it is ideal for all posting threads to also reap completions from
|
||||
* the rdma completion queue. there are two optimizations here. 1) for bandwidth we only want to
|
||||
* reap what is available now so more messages can be posted quickly, and 2) for latency (single
|
||||
* put/get before flushing) we want to ensure the operation is complete. To some degree this is
|
||||
* gaming the benchmark but it may benefit some application communication patterns without really
|
||||
* hurting others (in theory). */
|
||||
if (opal_using_threads ()) {
|
||||
int count = args->count;
|
||||
args->count += _mca_btl_ugni_cq_get_completed_desc_device (device, &device->dev_rdma_local_cq,
|
||||
args->completed + count,
|
||||
MCA_BTL_UGNI_DEVICE_REAP_COUNT - count,
|
||||
device->flushed);
|
||||
device->flushed = false;
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
static inline intptr_t mca_btl_ugni_post_rdma_device (mca_btl_ugni_device_t *device, void *arg)
|
||||
{
|
||||
struct mca_btl_ugni_post_device_args_t *args = (struct mca_btl_ugni_post_device_args_t *) arg;
|
||||
mca_btl_ugni_rdma_desc_t *rdma_desc;
|
||||
int rc;
|
||||
|
||||
rdma_desc = mca_btl_ugni_get_rdma_desc_device (device, args, true);
|
||||
if (OPAL_UNLIKELY(NULL == rdma_desc)) {
|
||||
return OPAL_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* pick the appropriate CQ */
|
||||
desc->cq = mca_btl_ugni_component.progress_thread_enabled ? &device->dev_rdma_local_irq_cq :
|
||||
rdma_desc->btl_ugni_desc.cq = mca_btl_ugni_component.progress_thread_enabled ? &device->dev_rdma_local_irq_cq :
|
||||
&device->dev_rdma_local_cq;
|
||||
|
||||
desc->desc.src_cq_hndl = desc->cq->gni_handle;
|
||||
BTL_VERBOSE(("Posting RDMA descriptor %p with op_type %d, amo %d, remote_addr 0x%lx, "
|
||||
"length %lu", (void*)desc, desc->gni_desc.type, desc->gni_desc.amo_cmd,
|
||||
desc->gni_desc.remote_addr, desc->gni_desc.length));
|
||||
|
||||
BTL_VERBOSE(("Posting RDMA descriptor %p with op_type %d, ep_handle %p, remote_addr 0x%lx, "
|
||||
"length %lu", (void*)desc, desc->desc.type, (void*)desc->ep_handle, desc->desc.remote_addr,
|
||||
desc->desc.length));
|
||||
|
||||
rc = GNI_PostRdma (desc->ep_handle->gni_handle, &desc->desc);
|
||||
rc = GNI_PostRdma (rdma_desc->gni_handle, &rdma_desc->btl_ugni_desc.gni_desc);
|
||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) {
|
||||
if (ep_handle_allocated) {
|
||||
/* only return the endpoint handle if we allocated it. if we didn't allocate the
|
||||
* handle this call was likely made from repost() */
|
||||
mca_btl_ugni_ep_return_rdma (desc->ep_handle);
|
||||
desc->ep_handle = NULL;
|
||||
}
|
||||
} else {
|
||||
++desc->cq->active_operations;
|
||||
mca_btl_ugni_return_rdma_desc (rdma_desc);
|
||||
return mca_btl_rc_ugni_to_opal (rc);
|
||||
}
|
||||
|
||||
return mca_btl_rc_ugni_to_opal (rc);
|
||||
++rdma_desc->btl_ugni_desc.cq->active_operations;
|
||||
|
||||
/* to improve bandwidth and latency it is ideal for all posting threads to also reap completions from
|
||||
* the rdma completion queue. there are two optimizations here. 1) for bandwidth we only want to
|
||||
* reap what is available now so more messages can be posted quickly, and 2) for latency (single
|
||||
* put/get before flushing) we want to ensure the operation is complete. To some degree this is
|
||||
* gaming the benchmark but it may benefit some application communication patterns without really
|
||||
* hurting others (in theory). */
|
||||
if (opal_using_threads ()) {
|
||||
int count = args->count;
|
||||
args->count += _mca_btl_ugni_cq_get_completed_desc_device (device, &device->dev_rdma_local_cq,
|
||||
args->completed + count,
|
||||
MCA_BTL_UGNI_DEVICE_REAP_COUNT - count,
|
||||
device->flushed);
|
||||
device->flushed = false;
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
static inline intptr_t mca_btl_ugni_post_cqwrite_device (mca_btl_ugni_device_t *device, void *arg)
|
||||
{
|
||||
mca_btl_ugni_post_descriptor_t *desc = (mca_btl_ugni_post_descriptor_t *) arg;
|
||||
mca_btl_ugni_rdma_desc_t *rdma_desc;
|
||||
int rc;
|
||||
|
||||
desc->ep_handle = mca_btl_ugni_ep_get_rdma (desc->endpoint, device);
|
||||
if (OPAL_UNLIKELY(NULL == desc->ep_handle)) {
|
||||
desc->gni_desc.src_cq_hndl = device->dev_rdma_local_cq.gni_handle;
|
||||
|
||||
rdma_desc = mca_btl_ugni_alloc_rdma_desc (device, desc, false);
|
||||
if (OPAL_UNLIKELY(NULL == rdma_desc)) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
desc->desc.src_cq_hndl = device->dev_rdma_local_cq.gni_handle;
|
||||
|
||||
rc = GNI_PostCqWrite (desc->ep_handle->gni_handle, &desc->desc);
|
||||
rc = GNI_PostCqWrite (rdma_desc->gni_handle, &rdma_desc->btl_ugni_desc.gni_desc);
|
||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) {
|
||||
mca_btl_ugni_ep_return_rdma (desc->ep_handle);
|
||||
desc->ep_handle = NULL;
|
||||
mca_btl_ugni_return_rdma_desc (rdma_desc);
|
||||
}
|
||||
|
||||
return mca_btl_rc_ugni_to_opal (rc);
|
||||
}
|
||||
|
||||
typedef struct mca_btl_ugni_cq_get_completed_desc_arg_t {
|
||||
mca_btl_ugni_cq_t *cq;
|
||||
gni_cq_entry_t *event_data;
|
||||
mca_btl_ugni_post_descriptor_t **post_desc;
|
||||
int count;
|
||||
} mca_btl_ugni_cq_get_completed_desc_arg_t;
|
||||
|
||||
/**
 * @brief Serialization callback: reap up to args->count completed descriptors
 *
 * @param[in]    device  device the call is serialized on (not used by the body)
 * @param[inout] arg0    pointer to a mca_btl_ugni_cq_get_completed_desc_arg_t;
 *                       its event_data and post_desc arrays are filled in
 *
 * @returns the number of events reaped when the queue runs dry early,
 *          args->count when every slot was filled, or the converted uGNI
 *          error code when an event or completion lookup fails
 *
 * For every event the matching post descriptor is looked up and stored in
 * args->post_desc. The queue's active operation counter is decremented for
 * each descriptor reaped.
 */
static inline intptr_t mca_btl_ugni_cq_get_completed_desc_device (mca_btl_ugni_device_t *device, void *arg0)
{
    mca_btl_ugni_cq_get_completed_desc_arg_t *args = (mca_btl_ugni_cq_get_completed_desc_arg_t *) arg0;
    mca_btl_ugni_cq_t *cq = args->cq;
    int reaped = 0;

    while (reaped < args->count) {
        gni_cq_entry_t *event = args->event_data + reaped;
        /* NOTE: keep this local named "desc"; the conversion macro below may use
         * its argument token as a member name */
        gni_post_descriptor_t *desc;
        int grc;

        grc = GNI_CqGetEvent (cq->gni_handle, event);
        if (GNI_RC_NOT_DONE == grc) {
            /* queue is empty: report how many were collected so far */
            return reaped;
        }

        if (OPAL_UNLIKELY((GNI_RC_SUCCESS != grc && !(*event)) || GNI_CQ_OVERRUN((*event)))) {
            /* TODO -- need to handle overrun -- how do we do this without an event?
               will the event eventually come back? Ask Cray */
            BTL_ERROR(("unhandled post error! ugni rc = %d %s", grc, gni_err_str[grc]));

            return mca_btl_rc_ugni_to_opal (grc);
        }

        grc = GNI_GetCompleted (cq->gni_handle, (*event), &desc);
        if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc && GNI_RC_TRANSACTION_ERROR != grc)) {
            BTL_ERROR(("Error in GNI_GetComplete %s", gni_err_str[grc]));
            return mca_btl_rc_ugni_to_opal (grc);
        }

        args->post_desc[reaped] = MCA_BTL_UGNI_DESC_TO_PDESC(desc);

        /* hand the endpoint handle back while the device lock is held, but only for
         * operations that finished cleanly: a failed operation may still need its
         * handle for a repost */
        if (OPAL_LIKELY(GNI_CQ_STATUS_OK((*event)))) {
            mca_btl_ugni_ep_return_rdma (args->post_desc[reaped]->ep_handle);
            args->post_desc[reaped]->ep_handle = NULL;
        }

        --cq->active_operations;
        ++reaped;
    }

    return args->count;
}
|
||||
|
||||
typedef struct mca_btl_ugni_get_datagram_args_t {
|
||||
mca_btl_ugni_module_t *ugni_module;
|
||||
gni_ep_handle_t *handle;
|
||||
@ -275,7 +399,7 @@ static inline intptr_t mca_btl_ugni_get_datagram_device (mca_btl_ugni_device_t *
|
||||
|
||||
if ((datagram_id & MCA_BTL_UGNI_DATAGRAM_MASK) == MCA_BTL_UGNI_CONNECT_DIRECTED_ID) {
|
||||
*(args->ep) = (mca_btl_base_endpoint_t *) opal_pointer_array_get_item (&args->ugni_module->endpoints, data);
|
||||
*(args->handle) = (*args->ep)->smsg_ep_handle->gni_handle;
|
||||
*(args->handle) = (*args->ep)->smsg_ep_handle.gni_handle;
|
||||
} else {
|
||||
*(args->handle) = args->ugni_module->wildcard_ep;
|
||||
}
|
||||
@ -336,11 +460,11 @@ static intptr_t mca_btl_ugni_dereg_mem_device (mca_btl_ugni_device_t *device, vo
|
||||
static inline int mca_btl_ugni_endpoint_smsg_send_wtag (mca_btl_base_endpoint_t *endpoint, void *hdr, size_t hdr_len,
|
||||
void *payload, size_t payload_len, uint32_t msg_id, int tag)
|
||||
{
|
||||
mca_btl_ugni_smsg_send_wtag_arg_t args = {.ep_handle = endpoint->smsg_ep_handle->gni_handle,
|
||||
mca_btl_ugni_smsg_send_wtag_arg_t args = {.ep_handle = endpoint->smsg_ep_handle.gni_handle,
|
||||
.hdr = hdr, .hdr_len = hdr_len, .payload = payload,
|
||||
.payload_len = payload_len, .msg_id = msg_id,
|
||||
.tag = tag};
|
||||
mca_btl_ugni_device_t *device = endpoint->smsg_ep_handle->device;
|
||||
mca_btl_ugni_device_t *device = endpoint->smsg_ep_handle.device;
|
||||
return (int) mca_btl_ugni_device_serialize (device, (mca_btl_ugni_device_serialize_fn_t) mca_btl_ugni_smsg_send_wtag_device, &args);
|
||||
}
|
||||
|
||||
@ -367,6 +491,10 @@ static inline void mca_btl_ugni_cq_clear (mca_btl_ugni_device_t *device, gni_cq_
|
||||
/**
 * @brief Fetch one event from a completion queue under device serialization
 *
 * @param[in]  device      device to serialize on
 * @param[in]  cq          completion queue to read
 * @param[out] event_data  storage for the event
 *
 * @returns the result of the serialized mca_btl_ugni_cq_get_event_device()
 *          call, narrowed to int
 */
static inline int mca_btl_ugni_cq_get_event (mca_btl_ugni_device_t *device, mca_btl_ugni_cq_t *cq, gni_cq_entry_t *event_data)
{
    mca_btl_ugni_cq_get_event_args_t get_args = {.event_data = event_data, .cq = cq};
    mca_btl_ugni_device_serialize_fn_t get_event_fn =
        (mca_btl_ugni_device_serialize_fn_t) mca_btl_ugni_cq_get_event_device;

    /* NTH: there is deliberately no early exit when the queue has no outstanding
     * operations. there appears to be a reason to poll the local SMSG completion
     * queue regardless, and since this function only handles that queue the extra
     * poll should not hurt performance. */
    return (int) mca_btl_ugni_device_serialize (device, get_event_fn, &get_args);
}
|
||||
|
||||
@ -376,18 +504,34 @@ static inline int mca_btl_ugni_gni_cq_get_event (mca_btl_ugni_device_t *device,
|
||||
return (int) mca_btl_ugni_device_serialize (device, (mca_btl_ugni_device_serialize_fn_t) mca_btl_ugni_gni_cq_get_event_device, &args);
|
||||
}
|
||||
|
||||
static inline int mca_btl_ugni_endpoint_post_fma (mca_btl_ugni_endpoint_t *endpoint, mca_btl_ugni_post_descriptor_t *desc)
|
||||
__opal_attribute_always_inline__
|
||||
static inline int mca_btl_ugni_endpoint_post (mca_btl_ugni_endpoint_t *endpoint, mca_btl_ugni_post_descriptor_t *desc,
|
||||
mca_btl_ugni_device_serialize_fn_t post_fn)
|
||||
{
|
||||
struct mca_btl_ugni_post_device_args_t args = {.desc = desc};
|
||||
mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_ep_btl (endpoint);
|
||||
mca_btl_ugni_device_t *device = desc->ep_handle ? desc->ep_handle->device : mca_btl_ugni_ep_get_device (ugni_module);
|
||||
return (int) mca_btl_ugni_device_serialize (device, (mca_btl_ugni_device_serialize_fn_t) mca_btl_ugni_post_fma_device, desc);
|
||||
int rc;
|
||||
|
||||
/* use serialize_any as it is responsible for binding devices to threads (if enabled). this generally
|
||||
* gives better performance as it reduces contention on any individual device. */
|
||||
rc = mca_btl_ugni_device_serialize_any (ugni_module, post_fn, &args);
|
||||
if (args.count) {
|
||||
mca_btl_ugni_handle_rdma_completions (ugni_module, args.device, args.completed, args.count);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
__opal_attribute_always_inline__
|
||||
static inline int mca_btl_ugni_endpoint_post_fma (mca_btl_ugni_endpoint_t *endpoint, mca_btl_ugni_post_descriptor_t *desc)
|
||||
{
|
||||
return mca_btl_ugni_endpoint_post (endpoint, desc, (mca_btl_ugni_device_serialize_fn_t) mca_btl_ugni_post_fma_device);
|
||||
}
|
||||
|
||||
__opal_attribute_always_inline__
|
||||
static inline int mca_btl_ugni_endpoint_post_rdma (mca_btl_ugni_endpoint_t *endpoint, mca_btl_ugni_post_descriptor_t *desc)
|
||||
{
|
||||
mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_ep_btl (endpoint);
|
||||
mca_btl_ugni_device_t *device = desc->ep_handle ? desc->ep_handle->device : mca_btl_ugni_ep_get_device (ugni_module);
|
||||
return (int) mca_btl_ugni_device_serialize (device, (mca_btl_ugni_device_serialize_fn_t) mca_btl_ugni_post_rdma_device, desc);
|
||||
return mca_btl_ugni_endpoint_post (endpoint, desc, (mca_btl_ugni_device_serialize_fn_t) mca_btl_ugni_post_rdma_device);
|
||||
}
|
||||
|
||||
static inline int mca_btl_ugni_endpoint_post_cqwrite (mca_btl_ugni_endpoint_t *endpoint, mca_btl_ugni_post_descriptor_t *desc)
|
||||
@ -397,11 +541,16 @@ static inline int mca_btl_ugni_endpoint_post_cqwrite (mca_btl_ugni_endpoint_t *e
|
||||
return (int) mca_btl_ugni_device_serialize (device, (mca_btl_ugni_device_serialize_fn_t) mca_btl_ugni_post_cqwrite_device, desc);
|
||||
}
|
||||
|
||||
__opal_attribute_always_inline__
|
||||
static inline int mca_btl_ugni_cq_get_completed_desc (mca_btl_ugni_device_t *device, mca_btl_ugni_cq_t *cq,
|
||||
gni_cq_entry_t *event_data, mca_btl_ugni_post_descriptor_t **post_desc,
|
||||
mca_btl_ugni_post_descriptor_t *post_desc,
|
||||
int count)
|
||||
{
|
||||
mca_btl_ugni_cq_get_completed_desc_arg_t args = {.cq = cq, .event_data = event_data, .post_desc = post_desc, .count = count};
|
||||
mca_btl_ugni_cq_get_completed_desc_arg_t args = {.cq = cq, .post_desc = post_desc, .count = count};
|
||||
if (0 == cq->active_operations) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return (int) mca_btl_ugni_device_serialize (device, (mca_btl_ugni_device_serialize_fn_t) mca_btl_ugni_cq_get_completed_desc_device, &args);
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011-2013 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
@ -156,7 +156,7 @@ int mca_btl_ugni_ep_disconnect (mca_btl_base_endpoint_t *ep, bool send_disconnec
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
device = ep->smsg_ep_handle->device;
|
||||
device = ep->smsg_ep_handle.device;
|
||||
|
||||
while (device->dev_smsg_local_cq.active_operations) {
|
||||
/* ensure all sends are complete before removing any procs */
|
||||
@ -181,15 +181,14 @@ int mca_btl_ugni_ep_disconnect (mca_btl_base_endpoint_t *ep, bool send_disconnec
|
||||
}
|
||||
} while (device->dev_smsg_local_cq.active_operations);
|
||||
|
||||
(void) opal_atomic_add_fetch_32 (&ep->smsg_ep_handle->device->smsg_connections, -1);
|
||||
(void) opal_atomic_add_fetch_32 (&ep->smsg_ep_handle.device->smsg_connections, -1);
|
||||
}
|
||||
|
||||
mca_btl_ugni_device_lock (device);
|
||||
|
||||
/* NTH: this call may not need the device lock. seems to work without it but
|
||||
* the lock is here to be safe. */
|
||||
(void) mca_btl_ugni_ep_handle_destroy (ep->smsg_ep_handle);
|
||||
ep->smsg_ep_handle = NULL;
|
||||
(void) mca_btl_ugni_ep_handle_cleanup (&ep->smsg_ep_handle);
|
||||
|
||||
mca_btl_ugni_device_unlock (device);
|
||||
|
||||
@ -221,10 +220,10 @@ static inline int mca_btl_ugni_ep_connect_start (mca_btl_base_endpoint_t *ep) {
|
||||
/* bind endpoint to remote address */
|
||||
/* we bind two endpoints to separate out local smsg completion and local fma completion */
|
||||
mca_btl_ugni_device_lock (device);
|
||||
ep->smsg_ep_handle = mca_btl_ugni_ep_handle_create (ep, device->dev_smsg_local_cq.gni_handle, device);
|
||||
rc = mca_btl_ugni_ep_handle_init (ep, device->dev_smsg_local_cq.gni_handle, device, &ep->smsg_ep_handle);
|
||||
mca_btl_ugni_device_unlock (device);
|
||||
if (OPAL_UNLIKELY(NULL == ep->smsg_ep_handle)) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* build connection data */
|
||||
@ -262,7 +261,7 @@ static inline int mca_btl_ugni_ep_connect_finish (mca_btl_base_endpoint_t *ep) {
|
||||
ep->mailbox->attr.smsg_attr.mem_hndl.qword2, ep->mailbox->attr.smsg_attr.mbox_offset,
|
||||
ep->mailbox->attr.smsg_attr.mbox_maxcredit, ep->mailbox->attr.smsg_attr.msg_maxsize));
|
||||
|
||||
grc = GNI_SmsgInit (ep->smsg_ep_handle->gni_handle, &ep->mailbox->attr.smsg_attr,
|
||||
grc = GNI_SmsgInit (ep->smsg_ep_handle.gni_handle, &ep->mailbox->attr.smsg_attr,
|
||||
&ep->remote_attr->smsg_attr);
|
||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc)) {
|
||||
BTL_ERROR(("error initializing SMSG protocol. rc = %d", grc));
|
||||
@ -274,11 +273,11 @@ static inline int mca_btl_ugni_ep_connect_finish (mca_btl_base_endpoint_t *ep) {
|
||||
* index on the remote peer. This makes lookup of endpoints on completion take
|
||||
* a single lookup in the endpoints array. we will not be able to change the
|
||||
* remote peer's index in the endpoint's array after this point. */
|
||||
GNI_EpSetEventData (ep->smsg_ep_handle->gni_handle, ep->index, ep->remote_attr->index);
|
||||
GNI_EpSetEventData (ep->smsg_ep_handle.gni_handle, ep->index, ep->remote_attr->index);
|
||||
|
||||
ep->rmt_irq_mem_hndl = ep->remote_attr->rmt_irq_mem_hndl;
|
||||
ep->state = MCA_BTL_UGNI_EP_STATE_CONNECTED;
|
||||
(void) opal_atomic_add_fetch_32 (&ep->smsg_ep_handle->device->smsg_connections, 1);
|
||||
(void) opal_atomic_add_fetch_32 (&ep->smsg_ep_handle.device->smsg_connections, 1);
|
||||
|
||||
/* send all pending messages */
|
||||
BTL_VERBOSE(("endpoint connected. posting %u sends", (unsigned int) opal_list_get_size (&ep->frag_wait_list)));
|
||||
@ -308,7 +307,7 @@ static int mca_btl_ugni_directed_ep_post (mca_btl_base_endpoint_t *ep)
|
||||
/* the irq cq is associated with only the first device */
|
||||
ep->mailbox->attr.rmt_irq_mem_hndl = ugni_module->devices->smsg_irq_mhndl;
|
||||
|
||||
rc = GNI_EpPostDataWId (ep->smsg_ep_handle->gni_handle, &ep->mailbox->attr, sizeof (ep->mailbox->attr),
|
||||
rc = GNI_EpPostDataWId (ep->smsg_ep_handle.gni_handle, &ep->mailbox->attr, sizeof (ep->mailbox->attr),
|
||||
ep->remote_attr, sizeof (*ep->remote_attr),
|
||||
MCA_BTL_UGNI_CONNECT_DIRECTED_ID | ep->index);
|
||||
if (OPAL_LIKELY(GNI_RC_SUCCESS == rc)) {
|
||||
@ -372,45 +371,11 @@ int mca_btl_ugni_ep_connect_progress (mca_btl_base_endpoint_t *ep)
|
||||
return mca_btl_ugni_ep_connect_finish (ep);
|
||||
}
|
||||
|
||||
int mca_btl_ugni_endpoint_handle_init_rdma (opal_free_list_item_t *item, void *ctx)
|
||||
int mca_btl_ugni_ep_handle_init (mca_btl_ugni_endpoint_t *ep, gni_cq_handle_t cq,
|
||||
mca_btl_ugni_device_t *device, mca_btl_ugni_endpoint_handle_t *ep_handle)
|
||||
{
|
||||
mca_btl_ugni_endpoint_handle_t *handle = (mca_btl_ugni_endpoint_handle_t *) item;
|
||||
mca_btl_ugni_device_t *device = (mca_btl_ugni_device_t *) ctx;
|
||||
gni_return_t grc;
|
||||
|
||||
grc = GNI_EpCreate (device->dev_handle, device->dev_rdma_local_cq.gni_handle, &handle->gni_handle);
|
||||
handle->device = device;
|
||||
return mca_btl_rc_ugni_to_opal (grc);
|
||||
}
|
||||
|
||||
static void mca_btl_ugni_endpoint_handle_construct (mca_btl_ugni_endpoint_handle_t *handle)
|
||||
{
|
||||
handle->gni_handle = 0;
|
||||
}
|
||||
|
||||
static void mca_btl_ugni_endpoint_handle_destruct (mca_btl_ugni_endpoint_handle_t *handle)
|
||||
{
|
||||
if (handle->gni_handle) {
|
||||
GNI_EpDestroy (handle->gni_handle);
|
||||
handle->gni_handle = 0;
|
||||
}
|
||||
}
|
||||
|
||||
OBJ_CLASS_INSTANCE(mca_btl_ugni_endpoint_handle_t, opal_object_t,
|
||||
mca_btl_ugni_endpoint_handle_construct,
|
||||
mca_btl_ugni_endpoint_handle_destruct);
|
||||
|
||||
mca_btl_ugni_endpoint_handle_t *mca_btl_ugni_ep_handle_create (mca_btl_ugni_endpoint_t *ep, gni_cq_handle_t cq,
|
||||
mca_btl_ugni_device_t *device)
|
||||
{
|
||||
mca_btl_ugni_endpoint_handle_t *ep_handle;
|
||||
gni_return_t grc;
|
||||
|
||||
ep_handle = OBJ_NEW(mca_btl_ugni_endpoint_handle_t);
|
||||
if (OPAL_UNLIKELY(NULL == ep_handle)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ep_handle->device = device;
|
||||
|
||||
/* create a uGNI endpoint handle and bind it to the remote peer */
|
||||
@ -419,19 +384,14 @@ mca_btl_ugni_endpoint_handle_t *mca_btl_ugni_ep_handle_create (mca_btl_ugni_endp
|
||||
grc = GNI_EpBind (ep_handle->gni_handle, ep->ep_rem_addr, ep->ep_rem_id);
|
||||
}
|
||||
|
||||
if (GNI_RC_SUCCESS != grc) {
|
||||
OBJ_RELEASE(ep_handle);
|
||||
ep_handle = NULL;
|
||||
}
|
||||
|
||||
return ep_handle;
|
||||
return mca_btl_rc_ugni_to_opal (grc);
|
||||
}
|
||||
|
||||
int mca_btl_ugni_ep_handle_destroy (mca_btl_ugni_endpoint_handle_t *ep_handle)
|
||||
int mca_btl_ugni_ep_handle_cleanup (mca_btl_ugni_endpoint_handle_t *ep_handle)
|
||||
{
|
||||
int rc;
|
||||
|
||||
if (NULL == ep_handle || 0 == ep_handle->gni_handle) {
|
||||
if (0 == ep_handle->gni_handle) {
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
@ -439,9 +399,11 @@ int mca_btl_ugni_ep_handle_destroy (mca_btl_ugni_endpoint_handle_t *ep_handle)
|
||||
rc = GNI_EpUnbind (ep_handle->gni_handle);
|
||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) {
|
||||
/* should warn */
|
||||
} else {
|
||||
(void) GNI_EpDestroy (ep_handle->gni_handle);
|
||||
}
|
||||
|
||||
OBJ_RELEASE(ep_handle);
|
||||
ep_handle->gni_handle = 0;
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -25,13 +25,11 @@ typedef enum mca_btl_ugni_endpoint_state_t mca_btl_ugni_endpoint_state_t;
|
||||
struct mca_btl_ugni_smsg_mbox_t;
|
||||
|
||||
struct mca_btl_ugni_endpoint_handle_t {
|
||||
opal_free_list_item_t super;
|
||||
mca_btl_ugni_device_t *device;
|
||||
gni_ep_handle_t gni_handle;
|
||||
};
|
||||
|
||||
typedef struct mca_btl_ugni_endpoint_handle_t mca_btl_ugni_endpoint_handle_t;
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ugni_endpoint_handle_t);
|
||||
|
||||
typedef struct mca_btl_base_endpoint_t {
|
||||
opal_list_item_t super;
|
||||
@ -51,7 +49,7 @@ typedef struct mca_btl_base_endpoint_t {
|
||||
uint32_t ep_rem_id;
|
||||
|
||||
/** endpoint to use for SMSG messages */
|
||||
mca_btl_ugni_endpoint_handle_t *smsg_ep_handle;
|
||||
mca_btl_ugni_endpoint_handle_t smsg_ep_handle;
|
||||
|
||||
/** temporary space to store the remote SMSG attributes */
|
||||
mca_btl_ugni_endpoint_attr_t *remote_attr;
|
||||
@ -132,73 +130,16 @@ static inline mca_btl_ugni_module_t *mca_btl_ugni_ep_btl (mca_btl_ugni_endpoint_
|
||||
}
|
||||
|
||||
/**
|
||||
* Allocate and bind a uGNI endpoint handle to the remote peer.
|
||||
* Initialize and bind an endpoint handle
|
||||
*
|
||||
* @param[in] ep BTL endpoint
|
||||
* @param[in] cq completion queue
|
||||
* @param[out] ep_handle uGNI endpoint handle
|
||||
* @param[in] device device to bind with
|
||||
* @param[in] ep_handle endpoint handle to initialize and bind
|
||||
*/
|
||||
mca_btl_ugni_endpoint_handle_t *mca_btl_ugni_ep_handle_create (mca_btl_ugni_endpoint_t *ep, gni_cq_handle_t cq,
|
||||
mca_btl_ugni_device_t *device);
|
||||
int mca_btl_ugni_ep_handle_init (mca_btl_ugni_endpoint_t *ep, gni_cq_handle_t cq,
|
||||
mca_btl_ugni_device_t *device, mca_btl_ugni_endpoint_handle_t *ep_handle);
|
||||
|
||||
/**
|
||||
* Unbind and free the uGNI endpoint handle.
|
||||
*
|
||||
* @param[in] ep_handle uGNI endpoint handle to unbind and release
|
||||
*/
|
||||
int mca_btl_ugni_ep_handle_destroy (mca_btl_ugni_endpoint_handle_t *ep_handle);
|
||||
|
||||
/**
|
||||
* Free list initialization function for endpoint handles (DO NOT CALL outside free list)
|
||||
*
|
||||
* @param[in] item Free list item to initialize
|
||||
* @param[in] ctx Free list context
|
||||
*
|
||||
* @returns OPAL_SUCCESS on success
|
||||
* @returns OPAL error code on error
|
||||
*/
|
||||
int mca_btl_ugni_endpoint_handle_init_rdma (opal_free_list_item_t *item, void *ctx);
|
||||
|
||||
/**
|
||||
* @brief get an endpoint handle from a device's free list
|
||||
*
|
||||
* @param[in] ep btl endpoint
|
||||
* @param[in] device btl device to use
|
||||
*
|
||||
* This function MUST be called with the device lock held. This was done over using
|
||||
* the atomic free list to avoid unnecessary atomics in the critical path.
|
||||
*/
|
||||
static inline mca_btl_ugni_endpoint_handle_t *
|
||||
mca_btl_ugni_ep_get_rdma (mca_btl_ugni_endpoint_t *ep, mca_btl_ugni_device_t *device)
|
||||
{
|
||||
mca_btl_ugni_endpoint_handle_t *ep_handle;
|
||||
gni_return_t grc;
|
||||
|
||||
ep_handle = (mca_btl_ugni_endpoint_handle_t *) opal_free_list_get_st (&device->endpoints);
|
||||
if (OPAL_UNLIKELY(NULL == ep_handle)) {
|
||||
return NULL;
|
||||
}
|
||||
grc = GNI_EpBind (ep_handle->gni_handle, ep->ep_rem_addr, ep->ep_rem_id | device->dev_index);
|
||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc)) {
|
||||
opal_free_list_return_st (&device->endpoints, &ep_handle->super);
|
||||
ep_handle = NULL;
|
||||
}
|
||||
|
||||
return ep_handle;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief return an endpoint handle to a device's free list
|
||||
*
|
||||
* @param[in] ep_handle endpoint handle to return
|
||||
*
|
||||
* This function MUST be called with the device lock held. This was done over using
|
||||
* the atomic free list to avoid unnecessary atomics in the critical path. If
|
||||
*/
|
||||
static inline void mca_btl_ugni_ep_return_rdma (mca_btl_ugni_endpoint_handle_t *ep_handle)
|
||||
{
|
||||
(void) GNI_EpUnbind (ep_handle->gni_handle);
|
||||
opal_free_list_return_st (&ep_handle->device->endpoints, &ep_handle->super);
|
||||
}
|
||||
int mca_btl_ugni_ep_handle_cleanup (mca_btl_ugni_endpoint_handle_t *ep_handle);
|
||||
|
||||
#endif /* MCA_BTL_UGNI_ENDPOINT_H */
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -41,12 +41,41 @@ OBJ_CLASS_INSTANCE(mca_btl_ugni_eager_frag_t, mca_btl_base_descriptor_t,
|
||||
static void mca_btl_ugni_post_descriptor_constructor (mca_btl_ugni_post_descriptor_t *desc)
|
||||
{
|
||||
desc->cq = NULL;
|
||||
desc->ep_handle = NULL;
|
||||
}
|
||||
|
||||
OBJ_CLASS_INSTANCE(mca_btl_ugni_post_descriptor_t, opal_free_list_item_t,
|
||||
mca_btl_ugni_post_descriptor_constructor, NULL);
|
||||
|
||||
/**
 * Object constructor for mca_btl_ugni_rdma_desc_t.
 *
 * Puts a freshly constructed descriptor into its empty state: cleared retry
 * counter, no uGNI endpoint handle (0 is the value the destructor treats as
 * "nothing to destroy"), and no owning device.
 *
 * @param[in,out] desc descriptor being constructed
 */
static void mca_btl_ugni_rdma_desc_constructor (mca_btl_ugni_rdma_desc_t *desc)
{
    desc->tries = 0;
    desc->gni_handle = 0;
    desc->device = NULL;
}
|
||||
|
||||
/**
 * Object destructor for mca_btl_ugni_rdma_desc_t.
 *
 * Destroys the uGNI endpoint handle if one is set, then resets the field to 0
 * so that running the destructor again is a no-op.
 *
 * @param[in,out] desc descriptor being destroyed
 */
static void mca_btl_ugni_rdma_desc_destructor (mca_btl_ugni_rdma_desc_t *desc)
{
    if (0 == desc->gni_handle) {
        /* no endpoint handle was ever created for this descriptor */
        return;
    }

    (void) GNI_EpDestroy (desc->gni_handle);
    desc->gni_handle = 0;
}
|
||||
|
||||
/**
 * Free-list item initializer for RDMA descriptors.
 *
 * Creates a uGNI endpoint on the device's RDMA local completion queue and
 * stores it in the descriptor together with a pointer to the device.
 *
 * @param[in,out] item free list item (an mca_btl_ugni_rdma_desc_t)
 * @param[in]     ctx  the mca_btl_ugni_device_t the descriptor belongs to
 *
 * @returns the GNI_EpCreate() result translated to an OPAL code
 */
int mca_btl_ugni_rdma_desc_init (opal_free_list_item_t *item, void *ctx)
{
    mca_btl_ugni_device_t *dev = (mca_btl_ugni_device_t *) ctx;
    mca_btl_ugni_rdma_desc_t *rdesc = (mca_btl_ugni_rdma_desc_t *) item;
    gni_return_t status = GNI_EpCreate (dev->dev_handle, dev->dev_rdma_local_cq.gni_handle,
                                        &rdesc->gni_handle);

    /* the device is recorded whether or not the endpoint could be created */
    rdesc->device = dev;

    return mca_btl_rc_ugni_to_opal (status);
}
|
||||
|
||||
|
||||
OBJ_CLASS_INSTANCE(mca_btl_ugni_rdma_desc_t, opal_free_list_item_t,
|
||||
mca_btl_ugni_rdma_desc_constructor, mca_btl_ugni_rdma_desc_destructor);
|
||||
|
||||
int mca_btl_ugni_frag_init (mca_btl_ugni_base_frag_t *frag, void *id)
|
||||
{
|
||||
/* NTH: the id is a combination of the module id and the free list id. for now there
|
||||
|
@ -19,6 +19,8 @@
|
||||
#include "btl_ugni.h"
|
||||
#include "btl_ugni_endpoint.h"
|
||||
|
||||
#include <string.h>
|
||||
|
||||
typedef struct mca_btl_ugni_send_frag_hdr_t {
|
||||
uint32_t lag;
|
||||
} mca_btl_ugni_send_frag_hdr_t;
|
||||
@ -83,59 +85,99 @@ typedef struct mca_btl_ugni_base_frag_t mca_btl_ugni_smsg_frag_t;
|
||||
typedef struct mca_btl_ugni_base_frag_t mca_btl_ugni_rdma_frag_t;
|
||||
typedef struct mca_btl_ugni_base_frag_t mca_btl_ugni_eager_frag_t;
|
||||
|
||||
#define MCA_BTL_UGNI_DESC_TO_FRAG(desc) \
|
||||
((mca_btl_ugni_base_frag_t *)((uintptr_t) (desc) - offsetof (mca_btl_ugni_base_frag_t, post_desc)))
|
||||
|
||||
typedef struct mca_btl_ugni_post_descriptor_t {
|
||||
opal_free_list_item_t super;
|
||||
gni_post_descriptor_t desc;
|
||||
mca_btl_ugni_endpoint_handle_t *ep_handle;
|
||||
/** endpoint currently associated with this descriptor */
|
||||
mca_btl_base_endpoint_t *endpoint;
|
||||
/** local memory handle (for callback) */
|
||||
mca_btl_base_registration_handle_t *local_handle;
|
||||
mca_btl_base_rdma_completion_fn_t cbfunc;
|
||||
/** currently associated completion queue */
|
||||
mca_btl_ugni_cq_t *cq;
|
||||
/** user callback function */
|
||||
mca_btl_base_rdma_completion_fn_t cbfunc;
|
||||
/** user callback data */
|
||||
void *cbdata;
|
||||
/** user callback context */
|
||||
void *ctx;
|
||||
int tries;
|
||||
/** opal status of this descriptor. filled in by
|
||||
* mca_btl_ugni_cq_get_completed_desc_device() */
|
||||
int rc;
|
||||
/** true if posted with the BTE. false if FMA. this is used as part
|
||||
* of the BTE throttling code. */
|
||||
bool use_bte;
|
||||
/** uGNI library post descriptor. this is last in this structure
|
||||
* to try to keep it hot in the cache after copying this descriptor
|
||||
* into the allocated descriptor. (post follows almost immediately
|
||||
* after allocate.) */
|
||||
gni_post_descriptor_t gni_desc;
|
||||
} mca_btl_ugni_post_descriptor_t;
|
||||
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ugni_post_descriptor_t);
|
||||
|
||||
#define MCA_BTL_UGNI_DESC_TO_PDESC(desc) \
|
||||
((mca_btl_ugni_post_descriptor_t *)((uintptr_t) (desc) - offsetof (mca_btl_ugni_post_descriptor_t, desc)))
|
||||
/**
 * RDMA descriptor kept on a device's rdma_descs free list. It pairs a copy of
 * the BTL post descriptor with the uGNI endpoint handle used to post it.
 */
typedef struct mca_btl_ugni_rdma_desc_t {
|
||||
/** free list item base; descriptors are taken from and returned to device->rdma_descs */
opal_free_list_item_t super;
|
||||
/** copy of the caller's post descriptor (filled in when the descriptor is allocated) */
mca_btl_ugni_post_descriptor_t btl_ugni_desc;
|
||||
/** device this descriptor belongs to (set by mca_btl_ugni_rdma_desc_init) */
mca_btl_ugni_device_t *device;
|
||||
/** uGNI endpoint handle created by GNI_EpCreate; 0 when none exists */
gni_ep_handle_t gni_handle;
|
||||
/** number of error-triggered retries so far; compared against rdma_max_retries */
int tries;
|
||||
} mca_btl_ugni_rdma_desc_t;
|
||||
|
||||
static inline mca_btl_ugni_post_descriptor_t *
|
||||
mca_btl_ugni_alloc_post_descriptor (mca_btl_base_endpoint_t *endpoint, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ugni_rdma_desc_t);
|
||||
|
||||
#define MCA_BTL_UGNI_GNI_DESC_TO_RDMA_DESC(desc) \
|
||||
((mca_btl_ugni_rdma_desc_t *) ((uintptr_t)(desc) - offsetof (mca_btl_ugni_rdma_desc_t, btl_ugni_desc) - offsetof (mca_btl_ugni_post_descriptor_t, gni_desc)))
|
||||
|
||||
/**
|
||||
* Initialize a RDMA descriptor
|
||||
*
|
||||
* @param[in] item free list item (must be of class mca_btl_ugni_rdma_desc_t)
|
||||
* @param[in] ctx pointer to ugni device context
|
||||
*
|
||||
* This function initializes a mca_btl_ugni_rdma_desc_t for use. It allocates
|
||||
* resources from the ugni library. This must be called before a RDMA
|
||||
* descriptor can be used. Usually this is passed as an argument to
|
||||
* opal_free_list_init().
|
||||
*/
|
||||
int mca_btl_ugni_rdma_desc_init (opal_free_list_item_t *item, void *ctx);
|
||||
|
||||
/**
|
||||
* @brief get an RDMA descriptor from a device's free list and bind it to the peer
|
||||
*
|
||||
* @param[in] device btl device to use
|
||||
* @param[in] ugni_desc post descriptor to copy (its endpoint selects the peer)
|
||||
* @param[in] use_bte whether this descriptor will be used with the BTE
|
||||
*
|
||||
* This function MUST be called with the device lock held. This was done over using
|
||||
* the atomic free list to avoid unnecessary atomics in the critical path.
|
||||
*/
|
||||
static inline mca_btl_ugni_rdma_desc_t *
|
||||
mca_btl_ugni_alloc_rdma_desc (mca_btl_ugni_device_t *device, mca_btl_ugni_post_descriptor_t *ugni_desc, const bool use_bte)
|
||||
{
|
||||
/* mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_ep_btl (endpoint); */
|
||||
mca_btl_ugni_post_descriptor_t *desc;
|
||||
mca_btl_ugni_rdma_desc_t *desc = (mca_btl_ugni_rdma_desc_t *) opal_free_list_get_st (&device->rdma_descs);
|
||||
mca_btl_ugni_endpoint_t *ep = ugni_desc->endpoint;
|
||||
gni_return_t grc;
|
||||
|
||||
desc = OBJ_NEW(mca_btl_ugni_post_descriptor_t);
|
||||
/* (mca_btl_ugni_post_descriptor_t *) opal_free_list_get (&ugni_module->post_descriptors); */
|
||||
if (OPAL_UNLIKELY(NULL != desc)) {
|
||||
desc->cbfunc = cbfunc;
|
||||
desc->ctx = cbcontext;
|
||||
desc->cbdata = cbdata;
|
||||
desc->local_handle = local_handle;
|
||||
desc->endpoint = endpoint;
|
||||
if (OPAL_LIKELY(NULL != desc)) {
|
||||
grc = GNI_EpBind (desc->gni_handle, ep->ep_rem_addr, ep->ep_rem_id | device->dev_index);
|
||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc)) {
|
||||
opal_free_list_return_st (&device->rdma_descs, &desc->super);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
desc->device = device;
|
||||
desc->tries = 0;
|
||||
desc->btl_ugni_desc = *ugni_desc;
|
||||
desc->btl_ugni_desc.use_bte = use_bte;
|
||||
}
|
||||
|
||||
return desc;
|
||||
}
|
||||
|
||||
static inline void mca_btl_ugni_return_post_descriptor (mca_btl_ugni_post_descriptor_t *desc)
|
||||
static inline void mca_btl_ugni_return_rdma_desc (mca_btl_ugni_rdma_desc_t *desc)
|
||||
{
|
||||
/* mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_ep_btl (desc->endpoint); */
|
||||
mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_ep_btl (desc->btl_ugni_desc.endpoint);
|
||||
|
||||
if (NULL != desc->ep_handle) {
|
||||
mca_btl_ugni_ep_return_rdma (desc->ep_handle);
|
||||
/* desc->ep_handle = NULL; */
|
||||
}
|
||||
|
||||
/* desc->cq = NULL; */
|
||||
/* opal_free_list_return (&ugni_module->post_descriptors, &desc->super); */
|
||||
free (desc);
|
||||
(void) GNI_EpUnbind (desc->gni_handle);
|
||||
opal_free_list_return_st (&desc->device->rdma_descs, &desc->super);
|
||||
}
|
||||
|
||||
static inline void mca_btl_ugni_post_desc_complete (mca_btl_ugni_module_t *module, mca_btl_ugni_post_descriptor_t *desc, int rc)
|
||||
@ -144,12 +186,9 @@ static inline void mca_btl_ugni_post_desc_complete (mca_btl_ugni_module_t *modul
|
||||
|
||||
if (NULL != desc->cbfunc) {
|
||||
/* call the user's callback function */
|
||||
desc->cbfunc (&module->super, desc->endpoint, (void *)(intptr_t) desc->desc.local_addr,
|
||||
desc->cbfunc (&module->super, desc->endpoint, (void *)(intptr_t) desc->gni_desc.local_addr,
|
||||
desc->local_handle, desc->ctx, desc->cbdata, rc);
|
||||
}
|
||||
|
||||
/* the descriptor is no longer needed */
|
||||
mca_btl_ugni_return_post_descriptor (desc);
|
||||
}
|
||||
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ugni_smsg_frag_t);
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
|
||||
@ -16,11 +16,13 @@
|
||||
|
||||
#include "btl_ugni.h"
|
||||
#include "btl_ugni_endpoint.h"
|
||||
#include "btl_ugni_frag.h"
|
||||
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/mca/pmix/pmix.h"
|
||||
#include "opal/util/bit_ops.h"
|
||||
#include "opal/mca/hwloc/base/base.h"
|
||||
|
||||
static inline int get_ptag(uint8_t *out_ptag)
|
||||
{
|
||||
@ -60,7 +62,6 @@ static inline int get_cookie (uint32_t *out_cookie)
|
||||
}
|
||||
|
||||
*out_cookie = tmp_cookie;
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
@ -115,18 +116,7 @@ int mca_btl_ugni_device_init (mca_btl_ugni_device_t *device, int virtual_device_
|
||||
uint32_t dev_pe_addr;
|
||||
int rc;
|
||||
|
||||
OBJ_CONSTRUCT(&device->endpoints, opal_free_list_t);
|
||||
OBJ_CONSTRUCT(&device->pending_post, opal_list_t);
|
||||
|
||||
rc = opal_free_list_init (&device->endpoints, sizeof (mca_btl_ugni_endpoint_handle_t),
|
||||
8, OBJ_CLASS(mca_btl_ugni_endpoint_handle_t), 0, 8, 0,
|
||||
mca_btl_ugni_component.local_cq_size, 16,
|
||||
NULL, 0, NULL, mca_btl_ugni_endpoint_handle_init_rdma,
|
||||
(void *) device);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
OBJ_DESTRUCT(&device->endpoints);
|
||||
return rc;
|
||||
}
|
||||
OBJ_CONSTRUCT(&device->rdma_descs, opal_free_list_t);
|
||||
|
||||
/* create a communication domain */
|
||||
rc = GNI_CdmCreate (mca_btl_ugni_component.cdm_id_base | virtual_device_id, mca_btl_ugni_component.ptag,
|
||||
@ -149,13 +139,23 @@ int mca_btl_ugni_device_init (mca_btl_ugni_device_t *device, int virtual_device_
|
||||
return mca_btl_rc_ugni_to_opal (rc);
|
||||
}
|
||||
|
||||
rc = opal_free_list_init (&device->rdma_descs, sizeof (mca_btl_ugni_rdma_desc_t),
|
||||
64, OBJ_CLASS(mca_btl_ugni_rdma_desc_t), 0, 8, 0,
|
||||
mca_btl_ugni_component.local_rdma_cq_size, 32,
|
||||
NULL, 0, NULL, mca_btl_ugni_rdma_desc_init, (void *) device);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
OBJ_DESTRUCT(&device->rdma_descs);
|
||||
return rc;
|
||||
}
|
||||
|
||||
device->lock = 0;
|
||||
device->dev_rdma_local_cq.gni_handle = 0;
|
||||
device->dev_rdma_local_cq.active_operations = 0;
|
||||
device->dev_rdma_local_irq_cq.gni_handle = 0;
|
||||
device->dev_rdma_local_irq_cq.active_operations = 0;
|
||||
device->dev_smsg_local_cq.gni_handle = 0;
|
||||
device->dev_smsg_local_cq.active_operations= 0;
|
||||
device->dev_smsg_local_cq.active_operations = 0;
|
||||
device->flushed = true;
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
@ -164,8 +164,7 @@ int mca_btl_ugni_device_fini (mca_btl_ugni_device_t *dev)
|
||||
{
|
||||
int rc;
|
||||
|
||||
OBJ_DESTRUCT(&dev->endpoints);
|
||||
OBJ_DESTRUCT(&dev->pending_post);
|
||||
OBJ_DESTRUCT(&dev->rdma_descs);
|
||||
|
||||
if (0 != dev->dev_rdma_local_cq.gni_handle) {
|
||||
GNI_CqDestroy (dev->dev_rdma_local_cq.gni_handle);
|
||||
@ -243,27 +242,46 @@ int mca_btl_ugni_init (void)
|
||||
FILE *fh;
|
||||
|
||||
if (0 == mca_btl_ugni_component.virtual_device_count) {
|
||||
/* XXX -- TODO -- might want to improve this logic. One option would be to
|
||||
* compare the number of local peers vs the number of cores or hyperthreads
|
||||
* on the node. */
|
||||
int core_count;
|
||||
|
||||
if (!opal_using_threads() || opal_process_info.num_local_peers >= 255) {
|
||||
(void) opal_hwloc_base_get_topology ();
|
||||
core_count = hwloc_get_nbobjs_by_type (opal_hwloc_topology, HWLOC_OBJ_CORE);
|
||||
|
||||
if (core_count <= opal_process_info.num_local_peers || !opal_using_threads()) {
|
||||
/* there is probably no benefit to using multiple device contexts when not
|
||||
* using threads. */
|
||||
mca_btl_ugni_component.virtual_device_count = 1;
|
||||
} else if (opal_process_info.num_local_peers >= 127) {
|
||||
mca_btl_ugni_component.virtual_device_count = 2;
|
||||
} else if (opal_process_info.num_local_peers >= 63) {
|
||||
mca_btl_ugni_component.virtual_device_count = 4;
|
||||
} else if (opal_process_info.num_local_peers >= 31) {
|
||||
mca_btl_ugni_component.virtual_device_count = 8;
|
||||
} else {
|
||||
mca_btl_ugni_component.virtual_device_count = 16;
|
||||
mca_btl_ugni_component.virtual_device_count = core_count / (opal_process_info.num_local_peers + 1);
|
||||
}
|
||||
} else if (MCA_BTL_UGNI_MAX_DEV_HANDLES < mca_btl_ugni_component.virtual_device_count) {
|
||||
}
|
||||
|
||||
if (MCA_BTL_UGNI_MAX_DEV_HANDLES < mca_btl_ugni_component.virtual_device_count) {
|
||||
mca_btl_ugni_component.virtual_device_count = MCA_BTL_UGNI_MAX_DEV_HANDLES;
|
||||
}
|
||||
|
||||
if (0 == mca_btl_ugni_component.local_rdma_cq_size) {
|
||||
if (1 == mca_btl_ugni_component.virtual_device_count) {
|
||||
mca_btl_ugni_component.local_rdma_cq_size = 2048;
|
||||
} else {
|
||||
mca_btl_ugni_component.local_rdma_cq_size = 256;
|
||||
}
|
||||
}
|
||||
|
||||
if ((mca_btl_ugni_component.virtual_device_count * (1 + opal_process_info.num_local_peers)) < 122) {
|
||||
/* if there are fewer total devices than FMA descriptors it makes sense to turn off FMA sharing.
|
||||
* *DO NOT* override a user requested flag. */
|
||||
mca_base_var_source_t source = MCA_BASE_VAR_SOURCE_DEFAULT;
|
||||
|
||||
mca_base_var_get_value (mca_btl_ugni_component.cdm_flags_id, NULL, &source, NULL);
|
||||
if (MCA_BASE_VAR_SOURCE_DEFAULT == source) {
|
||||
BTL_VERBOSE(("disabling shared FMA sharing"));
|
||||
|
||||
mca_btl_ugni_component.cdm_flags &= ~GNI_CDM_MODE_FMA_SHARED;
|
||||
mca_btl_ugni_component.cdm_flags |= GNI_CDM_MODE_FMA_DEDICATED;
|
||||
}
|
||||
}
|
||||
|
||||
fh = fopen ("/proc/sys/kernel/pid_max", "r");
|
||||
if (NULL != fh) {
|
||||
fscanf (fh, "%d", &pid_max);
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2014-2016 Research Organization for Information Science
|
||||
@ -59,6 +59,7 @@ mca_btl_ugni_module_t mca_btl_ugni_module = {
|
||||
.btl_atomic_op = mca_btl_ugni_aop,
|
||||
.btl_atomic_fop = mca_btl_ugni_afop,
|
||||
.btl_atomic_cswap = mca_btl_ugni_acswap,
|
||||
.btl_flush = mca_btl_ugni_flush,
|
||||
}
|
||||
};
|
||||
|
||||
@ -85,8 +86,8 @@ mca_btl_ugni_module_init (mca_btl_ugni_module_t *ugni_module)
|
||||
|
||||
ugni_module->initialized = false;
|
||||
ugni_module->nlocal_procs = 0;
|
||||
ugni_module->connected_peer_count = 0;
|
||||
ugni_module->active_datagrams = 0;
|
||||
ugni_module->active_rdma_count = 0;
|
||||
|
||||
opal_event_evtimer_set (opal_sync_event_base, &ugni_module->connection_event,
|
||||
mca_btl_ugni_datagram_event, ugni_module);
|
||||
@ -109,7 +110,6 @@ mca_btl_ugni_module_init (mca_btl_ugni_module_t *ugni_module)
|
||||
OBJ_CONSTRUCT(&ugni_module->id_to_endpoint, opal_hash_table_t);
|
||||
OBJ_CONSTRUCT(&ugni_module->smsg_mboxes, opal_free_list_t);
|
||||
OBJ_CONSTRUCT(&ugni_module->eager_get_pending, opal_list_t);
|
||||
OBJ_CONSTRUCT(&ugni_module->post_descriptors, opal_free_list_t);
|
||||
|
||||
/* set up virtual device handles */
|
||||
for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) {
|
||||
@ -325,3 +325,38 @@ static int mca_btl_ugni_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * Report an unrecoverable completion-queue error.
 *
 * Shared error path for "get event" and "get completed" failures. The exact
 * cause may be lost, but the condition is unrecoverable either way (this path
 * has never been seen in production).
 *
 * @param[in] grc        uGNI return code that triggered the error path
 * @param[in] event_data completion queue entry associated with the failure
 *
 * @returns grc translated to an OPAL error code
 */
int mca_btl_ugni_event_fatal_error (gni_return_t grc, gni_cq_entry_t event_data)
{
    if (!GNI_CQ_OVERRUN(event_data)) {
        BTL_ERROR(("Error in GNI_GetComplete %s", gni_err_str[grc]));
        return mca_btl_rc_ugni_to_opal (grc);
    }

    /* TODO -- need to handle overrun -- how do we do this without an event?
       will the event eventually come back? Ask Cray */
    BTL_ERROR(("CQ overrun detected in RDMA event data. can not recover"));

    return mca_btl_rc_ugni_to_opal (grc);
}
|
||||
|
||||
/**
 * Handle an error event reported for a posted RDMA descriptor.
 *
 * Asks uGNI whether the error is recoverable and bumps the descriptor's retry
 * counter. If the error is not recoverable, or the retry budget
 * (mca_btl_ugni_component.rdma_max_retries) is used up, the error is logged
 * and OPAL_ERROR is returned. Otherwise the descriptor is reposted on its
 * device.
 *
 * @param[in,out] rdma_desc  descriptor the error event belongs to
 * @param[in]     event_data completion queue entry describing the error
 *
 * @returns OPAL_ERROR when giving up
 * @returns the result of reposting the descriptor otherwise
 */
int mca_btl_ugni_device_handle_event_error (mca_btl_ugni_rdma_desc_t *rdma_desc, gni_cq_entry_t event_data)
{
    mca_btl_ugni_device_t *device = rdma_desc->device;
    /* default to "recoverable" in case the query below does not set it */
    uint32_t recoverable = 1;

    (void) GNI_CqErrorRecoverable (event_data, &recoverable);

    if (OPAL_UNLIKELY(++rdma_desc->tries >= mca_btl_ugni_component.rdma_max_retries || !recoverable)) {
        /* start from an empty string: the result of GNI_CqErrorStr() is not
         * checked, so the buffer must be a valid C string even if the call
         * leaves it untouched */
        char char_buffer[1024] = "";

        (void) GNI_CqErrorStr (event_data, char_buffer, sizeof (char_buffer));

        /* %p requires a void pointer and %d an int; cast to match the format */
        BTL_ERROR(("giving up on desciptor %p, recoverable %d: %s", (void *) rdma_desc, (int) recoverable, char_buffer));

        return OPAL_ERROR;
    }

    return _mca_btl_ugni_repost_rdma_desc_device (device, rdma_desc);
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -21,31 +21,36 @@ int mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *ep,
|
||||
mca_btl_ugni_eager_ex_frag_hdr_t hdr,
|
||||
mca_btl_ugni_base_frag_t *frag);
|
||||
|
||||
static inline void init_gni_post_desc (mca_btl_ugni_post_descriptor_t *post_desc,
|
||||
int order, gni_post_type_t op_type,
|
||||
uint64_t lcl_addr,
|
||||
gni_mem_handle_t lcl_mdh,
|
||||
uint64_t rem_addr,
|
||||
gni_mem_handle_t rem_mdh,
|
||||
uint64_t bufsize,
|
||||
gni_cq_handle_t cq_hndl) {
|
||||
post_desc->desc.type = op_type;
|
||||
post_desc->desc.cq_mode = GNI_CQMODE_GLOBAL_EVENT;
|
||||
static inline void init_post_desc (mca_btl_ugni_post_descriptor_t *post_desc,
|
||||
mca_btl_base_endpoint_t *endpoint, int order,
|
||||
gni_post_type_t op_type, uint64_t lcl_addr,
|
||||
gni_mem_handle_t lcl_mdh, uint64_t rem_addr,
|
||||
gni_mem_handle_t rem_mdh, uint64_t bufsize,
|
||||
gni_cq_handle_t cq_hndl, mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata,
|
||||
mca_btl_base_registration_handle_t *local_handle) {
|
||||
post_desc->endpoint = endpoint;
|
||||
post_desc->cbfunc = cbfunc;
|
||||
post_desc->ctx = cbcontext;
|
||||
post_desc->cbdata = cbdata;
|
||||
post_desc->local_handle = local_handle;
|
||||
post_desc->gni_desc.type = op_type;
|
||||
post_desc->gni_desc.cq_mode = GNI_CQMODE_GLOBAL_EVENT;
|
||||
if (MCA_BTL_NO_ORDER == order) {
|
||||
post_desc->desc.dlvr_mode = GNI_DLVMODE_PERFORMANCE;
|
||||
post_desc->gni_desc.dlvr_mode = GNI_DLVMODE_PERFORMANCE;
|
||||
} else {
|
||||
post_desc->desc.dlvr_mode = GNI_DLVMODE_NO_ADAPT;
|
||||
post_desc->gni_desc.dlvr_mode = GNI_DLVMODE_NO_ADAPT;
|
||||
}
|
||||
post_desc->desc.local_addr = (uint64_t) lcl_addr;
|
||||
post_desc->desc.local_mem_hndl = lcl_mdh;
|
||||
post_desc->desc.remote_addr = (uint64_t) rem_addr;
|
||||
post_desc->desc.remote_mem_hndl = rem_mdh;
|
||||
post_desc->desc.length = bufsize;
|
||||
post_desc->desc.rdma_mode = 0;
|
||||
post_desc->desc.src_cq_hndl = cq_hndl;
|
||||
post_desc->tries = 0;
|
||||
post_desc->gni_desc.local_addr = (uint64_t) lcl_addr;
|
||||
post_desc->gni_desc.local_mem_hndl = lcl_mdh;
|
||||
post_desc->gni_desc.remote_addr = (uint64_t) rem_addr;
|
||||
post_desc->gni_desc.remote_mem_hndl = rem_mdh;
|
||||
post_desc->gni_desc.length = bufsize;
|
||||
post_desc->gni_desc.rdma_mode = 0;
|
||||
post_desc->gni_desc.src_cq_hndl = cq_hndl;
|
||||
}
|
||||
|
||||
__opal_attribute_always_inline__
|
||||
static inline int mca_btl_ugni_post_fma (struct mca_btl_base_endpoint_t *endpoint, gni_post_type_t op_type,
|
||||
size_t size, void *local_address, uint64_t remote_address,
|
||||
mca_btl_base_registration_handle_t *local_handle,
|
||||
@ -53,30 +58,20 @@ static inline int mca_btl_ugni_post_fma (struct mca_btl_base_endpoint_t *endpoin
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata)
|
||||
{
|
||||
mca_btl_ugni_post_descriptor_t *post_desc;
|
||||
mca_btl_ugni_post_descriptor_t post_desc;
|
||||
gni_mem_handle_t local_gni_handle = {0, 0};
|
||||
int rc;
|
||||
|
||||
if (local_handle) {
|
||||
local_gni_handle = local_handle->gni_handle;
|
||||
}
|
||||
|
||||
post_desc = mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata);
|
||||
if (OPAL_UNLIKELY(NULL == post_desc)) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* Post descriptor (CQ is ignored for FMA transactions) -- The CQ associated with the endpoint
|
||||
* is used. */
|
||||
init_gni_post_desc (post_desc, order, op_type, (intptr_t) local_address, local_gni_handle,
|
||||
remote_address, remote_handle->gni_handle, size, 0);
|
||||
init_post_desc (&post_desc, endpoint, order, op_type, (intptr_t) local_address, local_gni_handle,
|
||||
remote_address, remote_handle->gni_handle, size, 0, cbfunc, cbcontext, cbdata,
|
||||
local_handle);
|
||||
|
||||
rc = mca_btl_ugni_endpoint_post_fma (endpoint, post_desc);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
mca_btl_ugni_return_post_descriptor (post_desc);
|
||||
}
|
||||
|
||||
return rc;
|
||||
return mca_btl_ugni_endpoint_post_fma (endpoint, &post_desc);
|
||||
}
|
||||
|
||||
static inline int mca_btl_ugni_post_bte (mca_btl_base_endpoint_t *endpoint, gni_post_type_t op_type,
|
||||
@ -86,21 +81,26 @@ static inline int mca_btl_ugni_post_bte (mca_btl_base_endpoint_t *endpoint, gni_
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata)
|
||||
{
|
||||
mca_btl_ugni_post_descriptor_t *post_desc;
|
||||
mca_btl_ugni_module_t *module = mca_btl_ugni_ep_btl (endpoint);
|
||||
mca_btl_ugni_post_descriptor_t post_desc;
|
||||
int rc;
|
||||
|
||||
post_desc = mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata);
|
||||
if (OPAL_UNLIKELY(NULL == post_desc)) {
|
||||
/* There is a performance benefit to throttling the total number of active BTE transactions. Not sure
|
||||
* what the optimum is but the limit is enforced as a soft limit. */
|
||||
if (module->active_rdma_count >= mca_btl_ugni_component.active_rdma_threshold) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* Post descriptor */
|
||||
init_gni_post_desc (post_desc, order, op_type, (intptr_t) local_address, local_handle->gni_handle,
|
||||
remote_address, remote_handle->gni_handle, size, 0);
|
||||
(void) OPAL_THREAD_FETCH_ADD32 (&module->active_rdma_count, 1);
|
||||
|
||||
rc = mca_btl_ugni_endpoint_post_rdma (endpoint, post_desc);
|
||||
/* Post descriptor */
|
||||
init_post_desc (&post_desc, endpoint, order, op_type, (intptr_t) local_address, local_handle->gni_handle,
|
||||
remote_address, remote_handle->gni_handle, size, 0, cbfunc, cbcontext, cbdata,
|
||||
local_handle);
|
||||
|
||||
rc = mca_btl_ugni_endpoint_post_rdma (endpoint, &post_desc);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
mca_btl_ugni_return_post_descriptor (post_desc);
|
||||
(void) OPAL_THREAD_FETCH_ADD32 (&module->active_rdma_count, -1);
|
||||
}
|
||||
|
||||
return rc;
|
||||
@ -111,31 +111,20 @@ static inline int mca_btl_ugni_post_cqwrite (mca_btl_base_endpoint_t *endpoint,
|
||||
mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata)
|
||||
{
|
||||
mca_btl_ugni_post_descriptor_t *post_desc;
|
||||
int rc;
|
||||
mca_btl_ugni_post_descriptor_t post_desc;
|
||||
|
||||
post_desc = mca_btl_ugni_alloc_post_descriptor (endpoint, NULL, cbfunc, cbcontext, cbdata);
|
||||
if (OPAL_UNLIKELY(NULL == post_desc)) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
post_desc.gni_desc.type = GNI_POST_CQWRITE;
|
||||
post_desc.gni_desc.cqwrite_value = value; /* up to 48 bytes here, not used for now */
|
||||
post_desc.gni_desc.cq_mode = GNI_CQMODE_GLOBAL_EVENT;
|
||||
post_desc.gni_desc.dlvr_mode = GNI_DLVMODE_IN_ORDER;
|
||||
post_desc.gni_desc.src_cq_hndl = cq->gni_handle;
|
||||
post_desc.gni_desc.remote_mem_hndl = irq_mhndl;
|
||||
post_desc.cq = cq;
|
||||
|
||||
post_desc->desc.type = GNI_POST_CQWRITE;
|
||||
post_desc->desc.cqwrite_value = value; /* up to 48 bytes here, not used for now */
|
||||
post_desc->desc.cq_mode = GNI_CQMODE_GLOBAL_EVENT;
|
||||
post_desc->desc.dlvr_mode = GNI_DLVMODE_IN_ORDER;
|
||||
post_desc->desc.src_cq_hndl = cq->gni_handle;
|
||||
post_desc->desc.remote_mem_hndl = irq_mhndl;
|
||||
post_desc->tries = 0;
|
||||
post_desc->cq = cq;
|
||||
|
||||
rc = mca_btl_ugni_endpoint_post_cqwrite (endpoint, post_desc);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { /* errors for PostCqWrite treated as non-fatal */
|
||||
mca_btl_ugni_return_post_descriptor (post_desc);
|
||||
}
|
||||
|
||||
return rc;
|
||||
return mca_btl_ugni_endpoint_post_cqwrite (endpoint, &post_desc);
|
||||
}
|
||||
|
||||
__opal_attribute_always_inline__
|
||||
static inline int mca_btl_ugni_post (mca_btl_base_endpoint_t *endpoint, int get, size_t size,
|
||||
void *local_address, uint64_t remote_address,
|
||||
mca_btl_base_registration_handle_t *local_handle,
|
||||
@ -145,8 +134,10 @@ static inline int mca_btl_ugni_post (mca_btl_base_endpoint_t *endpoint, int get,
|
||||
{
|
||||
const gni_post_type_t fma_ops[2] = {GNI_POST_FMA_PUT, GNI_POST_FMA_GET};
|
||||
const gni_post_type_t rdma_ops[2] = {GNI_POST_RDMA_PUT, GNI_POST_RDMA_GET};
|
||||
const long int fma_limit = get ? mca_btl_ugni_component.ugni_fma_get_limit :
|
||||
mca_btl_ugni_component.ugni_fma_put_limit;
|
||||
|
||||
if (size <= mca_btl_ugni_component.ugni_fma_limit) {
|
||||
if (size <= fma_limit) {
|
||||
return mca_btl_ugni_post_fma (endpoint, fma_ops[get], size, local_address, remote_address,
|
||||
local_handle, remote_handle, order, cbfunc, cbcontext, cbdata);
|
||||
}
|
||||
@ -155,13 +146,4 @@ static inline int mca_btl_ugni_post (mca_btl_base_endpoint_t *endpoint, int get,
|
||||
local_handle, remote_handle, order, cbfunc, cbcontext, cbdata);
|
||||
}
|
||||
|
||||
static inline int mca_btl_ugni_repost (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_post_descriptor_t *post_desc)
|
||||
{
|
||||
if (GNI_POST_RDMA_PUT == post_desc->desc.type || GNI_POST_RDMA_GET == post_desc->desc.type) {
|
||||
return mca_btl_ugni_endpoint_post_rdma (post_desc->endpoint, post_desc);
|
||||
}
|
||||
|
||||
return mca_btl_ugni_endpoint_post_fma (post_desc->endpoint, post_desc);
|
||||
}
|
||||
|
||||
#endif /* MCA_BTL_UGNI_RDMA_H */
|
||||
|
@ -74,7 +74,7 @@ int mca_btl_ugni_smsg_process (mca_btl_base_endpoint_t *ep)
|
||||
do {
|
||||
uint8_t tag = GNI_SMSG_ANY_TAG;
|
||||
|
||||
rc = mca_btl_ugni_smsg_get_next_wtag (ep->smsg_ep_handle, &data_ptr, &tag);
|
||||
rc = mca_btl_ugni_smsg_get_next_wtag (&ep->smsg_ep_handle, &data_ptr, &tag);
|
||||
if (GNI_RC_SUCCESS != rc) {
|
||||
if (OPAL_LIKELY(GNI_RC_NOT_DONE == rc)) {
|
||||
BTL_VERBOSE(("no smsg message waiting. rc = %s", gni_err_str[rc]));
|
||||
@ -138,7 +138,7 @@ int mca_btl_ugni_smsg_process (mca_btl_base_endpoint_t *ep)
|
||||
break;
|
||||
}
|
||||
|
||||
rc = mca_btl_ugni_smsg_release (ep->smsg_ep_handle);
|
||||
rc = mca_btl_ugni_smsg_release (&ep->smsg_ep_handle);
|
||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) {
|
||||
BTL_ERROR(("Smsg release failed! rc = %d", rc));
|
||||
return OPAL_ERROR;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -49,10 +49,8 @@ static inline int mca_btl_ugni_progress_local_smsg (mca_btl_ugni_module_t *ugni_
|
||||
|
||||
if (OPAL_UNLIKELY((GNI_RC_SUCCESS != grc && !event_data) || GNI_CQ_OVERRUN(event_data))) {
|
||||
/* TODO -- need to handle overrun -- how do we do this without an event?
|
||||
will the event eventually come back? Ask Cray */
|
||||
BTL_ERROR(("post error! cq overrun = %d", (int)GNI_CQ_OVERRUN(event_data)));
|
||||
assert (0);
|
||||
return mca_btl_rc_ugni_to_opal (grc);
|
||||
* will the event eventually come back? Ask Cray */
|
||||
return mca_btl_ugni_event_fatal_error (grc, event_data);
|
||||
}
|
||||
|
||||
assert (GNI_CQ_GET_TYPE(event_data) == GNI_CQ_EVENT_TYPE_SMSG);
|
||||
@ -93,7 +91,7 @@ static inline int opal_mca_btl_ugni_smsg_send (mca_btl_ugni_base_frag_t *frag,
|
||||
}
|
||||
}
|
||||
|
||||
(void) mca_btl_ugni_progress_local_smsg (ugni_module, endpoint->smsg_ep_handle->device);
|
||||
(void) mca_btl_ugni_progress_local_smsg (ugni_module, endpoint->smsg_ep_handle.device);
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
@ -104,7 +102,8 @@ static inline int opal_mca_btl_ugni_smsg_send (mca_btl_ugni_base_frag_t *frag,
|
||||
}
|
||||
|
||||
BTL_ERROR(("GNI_SmsgSendWTag failed with rc = %d. handle = %lu, hdr_len = %d, payload_len = %d",
|
||||
grc, (uintptr_t) frag->endpoint->smsg_ep_handle, (int) hdr_len, (int) payload_len));
|
||||
grc, (uintptr_t) frag->endpoint->smsg_ep_handle.gni_handle, (int) hdr_len,
|
||||
(int) payload_len));
|
||||
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user