1
1

ugni: update to latest btl code. bug fixes and cleanup

This commit was SVN r26529.
Этот коммит содержится в:
Nathan Hjelm 2012-05-31 20:02:41 +00:00
родитель 3ccd286de1
Коммит 71bffa5158
16 изменённых файлов: 421 добавлений и 392 удалений

Просмотреть файл

@ -24,7 +24,6 @@
#include "ompi/mca/mpool/mpool.h"
#include "ompi/mca/mpool/base/base.h"
#include "ompi/mca/mpool/rdma/mpool_rdma.h"
#include "ompi/runtime/ompi_module_exchange.h"
#include "opal/util/output.h"
#include "opal_stdint.h"
@ -67,7 +66,7 @@ typedef struct mca_btl_ugni_module_t {
gni_cq_handle_t smsg_remote_cq;
gni_cq_handle_t smsg_local_cq;
/* eager (registered) fragment list */
/* eager fragment list (registered) */
ompi_free_list_t eager_frags_send;
ompi_free_list_t eager_frags_recv;
@ -78,6 +77,9 @@ typedef struct mca_btl_ugni_module_t {
ompi_free_list_t rdma_frags;
ompi_free_list_t rdma_int_frags;
/* endpoints waiting on credits */
opal_list_t ep_wait_list;
/* fragment id bounce buffer (smsg msg ids are only 32 bits) */
opal_pointer_array_t pending_smsg_frags_bb;
@ -92,7 +94,7 @@ typedef struct mca_btl_ugni_component_t {
/* maximum supported btls. hardcoded to 1 for now */
uint32_t ugni_max_btls;
/* Maximum number of entries a completion queue can hold */
uint32_t cq_size;
uint32_t remote_cq_size;
uint32_t local_cq_size;
/* number of ugni modules */
@ -240,6 +242,8 @@ mca_btl_ugni_put (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *des);
int mca_btl_progress_send_wait_list (struct mca_btl_base_endpoint_t *endpoint);
mca_btl_base_descriptor_t *
mca_btl_ugni_alloc(struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,

Просмотреть файл

@ -14,6 +14,7 @@
#include "btl_ugni.h"
#include "btl_ugni_frag.h"
#include "btl_ugni_smsg.h"
static int
mca_btl_ugni_setup_mpools (mca_btl_ugni_module_t *ugni_module);
@ -80,7 +81,7 @@ int mca_btl_ugni_add_procs(struct mca_btl_base_module_t* btl,
return ompi_common_rc_ugni_to_ompi (rc);
}
rc = GNI_CqCreate (ugni_module->device->dev_handle, mca_btl_ugni_component.cq_size,
rc = GNI_CqCreate (ugni_module->device->dev_handle, mca_btl_ugni_component.remote_cq_size,
0, GNI_CQ_NOBLOCK, NULL, NULL, &ugni_module->smsg_remote_cq);
if (GNI_RC_SUCCESS != rc) {
BTL_ERROR(("error creating remote SMSG CQ"));
@ -92,6 +93,12 @@ int mca_btl_ugni_add_procs(struct mca_btl_base_module_t* btl,
BTL_ERROR(("btl/ugni error setting up mpools/free lists"));
return rc;
}
rc = mca_btl_ugni_smsg_init (ugni_module);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
BTL_ERROR(("btl/ugni error initializing SMSG"));
return rc;
}
}
ugni_module->endpoint_count += nprocs;
@ -129,45 +136,41 @@ int mca_btl_ugni_del_procs (struct mca_btl_base_module_t *btl,
return OMPI_SUCCESS;
}
/*
 * Register a memory region with the uGNI device and store the resulting
 * memory handle in the registration object.
 *
 * ugni_module  module whose device handle is used; its reg_count is
 *              incremented on success
 * base, size   start address and length handed to GNI_MemRegister
 * reg          registration object, accessed as mca_btl_ugni_reg_t so the
 *              memory handle can be written to it
 * cq, flags    completion queue handle and flags forwarded unchanged to
 *              GNI_MemRegister
 *
 * Returns OMPI_SUCCESS on success. Any GNI failure, whatever its code, is
 * reported as OMPI_ERR_OUT_OF_RESOURCE.
 */
static inline int ugni_reg_mem (mca_btl_ugni_module_t *ugni_module, void *base,
size_t size, mca_mpool_base_registration_t *reg,
gni_cq_handle_t cq, uint32_t flags)
{
mca_btl_ugni_reg_t *ugni_reg = (mca_btl_ugni_reg_t *) reg;
gni_return_t rc;
rc = GNI_MemRegister (ugni_module->device->dev_handle, (uint64_t) base,
size, cq, flags, -1, &(ugni_reg->memory_hdl));
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
/* only successful registrations are counted */
ugni_module->reg_count++;
return OMPI_SUCCESS;
}
static int ugni_reg_rdma_mem (void *reg_data, void *base, size_t size,
mca_mpool_base_registration_t *reg)
{
mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) reg_data;
mca_btl_ugni_reg_t *ugni_reg = (mca_btl_ugni_reg_t *) reg;
gni_return_t rc;
if (ugni_module->reg_count >= ugni_module->reg_max) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
return ugni_reg_mem (ugni_module, base, size, reg, NULL,
GNI_MEM_READWRITE | GNI_MEM_RELAXED_PI_ORDERING);
rc = GNI_MemRegister (ugni_module->device->dev_handle, (uint64_t) base,
size, NULL, GNI_MEM_READWRITE | GNI_MEM_RELAXED_PI_ORDERING,
-1, &(ugni_reg->memory_hdl));
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
ugni_module->reg_count++;
return OMPI_SUCCESS;
}
static int ugni_reg_smsg_mem (void *reg_data, void *base, size_t size,
mca_mpool_base_registration_t *reg)
{
mca_btl_ugni_module_t *btl = (mca_btl_ugni_module_t *) reg_data;
mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) reg_data;
mca_btl_ugni_reg_t *ugni_reg = (mca_btl_ugni_reg_t *) reg;
gni_return_t rc;
return ugni_reg_mem (btl, base, size, reg, btl->smsg_remote_cq,
GNI_MEM_READWRITE);
rc = GNI_MemRegister (ugni_module->device->dev_handle, (uint64_t) base,
size, ugni_module->smsg_remote_cq, GNI_MEM_READWRITE, -1,
&(ugni_reg->memory_hdl));
return ompi_common_rc_ugni_to_ompi (rc);
}
static int
@ -218,7 +221,7 @@ mca_btl_ugni_setup_mpools (mca_btl_ugni_module_t *ugni_module)
}
rc = ompi_free_list_init_ex_new (&ugni_module->rdma_frags,
sizeof (mca_btl_ugni_rdma_frag_t), 8,
sizeof (mca_btl_ugni_rdma_frag_t), 64,
OBJ_CLASS(mca_btl_ugni_rdma_frag_t),
0, opal_cache_line_size,
mca_btl_ugni_component.ugni_free_list_num,
@ -308,8 +311,8 @@ mca_btl_ugni_setup_mpools (mca_btl_ugni_module_t *ugni_module)
rc = ompi_free_list_init_new (&ugni_module->smsg_mboxes,
sizeof (mca_btl_ugni_smsg_mbox_t), 8,
OBJ_CLASS(mca_btl_ugni_smsg_mbox_t),
mca_btl_ugni_component.smsg_mbox_size, 64,
16, nprocs, mbox_increment,
mca_btl_ugni_component.smsg_mbox_size, 128,
32, nprocs, mbox_increment,
ugni_module->smsg_mpool);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
BTL_ERROR(("error creating smsg mailbox free list"));

Просмотреть файл

@ -77,13 +77,14 @@ btl_ugni_component_register(void)
mca_btl_ugni_component.ugni_eager_inc =
mca_btl_ugni_param_register_int("eager_inc", NULL, 16);
mca_btl_ugni_component.cq_size =
mca_btl_ugni_param_register_int("cq_size", NULL, 40000);
mca_btl_ugni_component.remote_cq_size =
mca_btl_ugni_param_register_int("remote_cq_size", "Remote SMSG completion queue "
"size (default 40000)", 40000);
mca_btl_ugni_component.local_cq_size =
mca_btl_ugni_param_register_int("local_cq_size", NULL, 8192);
mca_btl_ugni_param_register_int("local_cq_size", "Local completion queue size "
"(default 8192)", 8192);
/* SMSG limit. 0 - autoselect */
mca_btl_ugni_component.ugni_smsg_limit =
mca_btl_ugni_param_register_int("smsg_limit", "Maximum size message that "
"will be sent using the SMSG/MSGQ protocol "
@ -212,7 +213,28 @@ static void mca_btl_ugni_autoset_leave_pinned (void) {
static int mca_btl_ugni_smsg_setup (void) {
gni_smsg_attr_t tmp_smsg_attrib;
unsigned int mbox_size;
int rc;
size_t nprocs;
gni_return_t rc;
(void) ompi_proc_world (&nprocs);
if (0 == mca_btl_ugni_component.ugni_smsg_limit) {
/* auto-set the smsg limit based on the number of ranks */
if (nprocs <= 512) {
mca_btl_ugni_component.ugni_smsg_limit = 8192;
} else if (nprocs <= 1024) {
mca_btl_ugni_component.ugni_smsg_limit = 2048;
} else if (nprocs <= 8192) {
mca_btl_ugni_component.ugni_smsg_limit = 1024;
} else if (nprocs <= 16384) {
mca_btl_ugni_component.ugni_smsg_limit = 512;
} else {
mca_btl_ugni_component.ugni_smsg_limit = 256;
}
}
mca_btl_ugni_component.smsg_max_data = mca_btl_ugni_component.ugni_smsg_limit -
sizeof (mca_btl_ugni_send_frag_hdr_t);
/* calculate mailbox size */
tmp_smsg_attrib.msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
@ -238,7 +260,6 @@ mca_btl_ugni_component_init (int *num_btl_modules,
struct mca_btl_base_module_t **base_modules;
mca_btl_ugni_module_t *ugni_modules;
unsigned int i;
size_t nprocs;
int rc;
/* Initialize ugni library and create communication domain */
@ -247,11 +268,7 @@ mca_btl_ugni_component_init (int *num_btl_modules,
return NULL;
}
/* Create and initialize modules
* Create one module per device
* One btl == One module
*/
/* Manju: I should set this automatically, not hardcoded */
/* Create and initialize one module per uGNI device */
mca_btl_ugni_component.ugni_num_btls = ompi_common_ugni_module.device_count;
BTL_VERBOSE(("btl/ugni initializing"));
@ -275,37 +292,17 @@ mca_btl_ugni_component_init (int *num_btl_modules,
mca_btl_ugni_autoset_leave_pinned ();
(void) ompi_proc_world (&nprocs);
if (0 == mca_btl_ugni_component.ugni_smsg_limit) {
/* auto-set the smsg limit based on the number of ranks */
if (nprocs <= 512) {
mca_btl_ugni_component.ugni_smsg_limit = 8192;
} else if (nprocs <= 1024) {
mca_btl_ugni_component.ugni_smsg_limit = 2048;
} else if (nprocs <= 8192) {
mca_btl_ugni_component.ugni_smsg_limit = 1024;
} else if (nprocs <= 16384) {
mca_btl_ugni_component.ugni_smsg_limit = 512;
} else {
mca_btl_ugni_component.ugni_smsg_limit = 256;
}
rc = mca_btl_ugni_smsg_setup ();
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
return NULL;
}
mca_btl_ugni_component.smsg_max_data = mca_btl_ugni_component.ugni_smsg_limit -
sizeof (mca_btl_ugni_send_frag_hdr_t);
if (mca_btl_ugni_component.ugni_smsg_limit == mca_btl_ugni_module.super.btl_eager_limit) {
mca_btl_ugni_module.super.btl_eager_limit = mca_btl_ugni_component.smsg_max_data;
}
mca_btl_ugni_module.super.btl_rdma_pipeline_send_length = mca_btl_ugni_module.super.btl_eager_limit;
rc = mca_btl_ugni_smsg_setup ();
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
return NULL;
}
for (i = 0 ; i < mca_btl_ugni_component.ugni_num_btls ; ++i) {
mca_btl_ugni_module_t *ugni_module = ugni_modules + i;
@ -328,7 +325,7 @@ mca_btl_ugni_component_init (int *num_btl_modules,
}
static inline int
mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *btl)
mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module)
{
uint32_t remote_addr, remote_id;
mca_btl_base_endpoint_t *ep;
@ -339,17 +336,17 @@ mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *btl)
int count = 0;
/* check for datagram completion */
grc = GNI_PostDataProbeById (btl->device->dev_handle, &datagram_id);
grc = GNI_PostDataProbeById (ugni_module->device->dev_handle, &datagram_id);
if (OPAL_LIKELY(GNI_RC_SUCCESS != grc)) {
return 0;
}
if ((datagram_id & MCA_BTL_UGNI_DATAGRAM_MASK) ==
MCA_BTL_UGNI_CONNECT_WILDCARD_ID) {
handle = btl->wildcard_ep;
handle = ugni_module->wildcard_ep;
} else {
handle =
btl->endpoints[(uint32_t)(datagram_id & 0xffffffffull)]->smsg_ep_handle;
ugni_module->endpoints[(uint32_t)(datagram_id & 0xffffffffull)]->smsg_ep_handle;
}
/* wait for the incoming datagram to complete (in case it isn't) */
@ -363,7 +360,7 @@ mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *btl)
BTL_VERBOSE(("got a datagram completion: id = %" PRIx64 ", state = %d, "
"peer = %d", datagram_id, post_state, remote_id));
ep = btl->endpoints[remote_id];
ep = ugni_module->endpoints[remote_id];
/* NTH: TODO -- error handling */
(void) mca_btl_ugni_ep_connect_progress (ep);
@ -373,24 +370,25 @@ mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *btl)
count = mca_btl_ugni_smsg_process (ep);
}
/* repost the wildcard datagram */
if ((datagram_id & MCA_BTL_UGNI_DATAGRAM_MASK) ==
MCA_BTL_UGNI_CONNECT_WILDCARD_ID) {
mca_btl_ugni_wildcard_ep_post (btl);
mca_btl_ugni_wildcard_ep_post (ugni_module);
}
return count;
}
static inline int
mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *btl)
mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module)
{
ompi_common_ugni_post_desc_t *desc;
gni_return_t rc = GNI_RC_NOT_DONE;
mca_btl_ugni_base_frag_t *frag;
gni_cq_entry_t event_data = 0;
uint32_t recoverable = 1;
gni_return_t rc;
rc = GNI_CqGetEvent (btl->rdma_local_cq, &event_data);
rc = GNI_CqGetEvent (ugni_module->rdma_local_cq, &event_data);
if (GNI_RC_NOT_DONE == rc) {
return 0;
}
@ -398,12 +396,12 @@ mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *btl)
if (OPAL_UNLIKELY((GNI_RC_SUCCESS != rc && !event_data) || GNI_CQ_OVERRUN(event_data))) {
/* TODO -- need to handle overrun -- how do we do this without an event?
will the event eventually come back? Ask Cray */
BTL_ERROR(("post error! cq overrun = %d", (int)GNI_CQ_OVERRUN(event_data)));
BTL_ERROR(("unhandled post error! ugni rc = %d", rc));
assert (0);
return ompi_common_rc_ugni_to_ompi (rc);
}
rc = GNI_GetCompleted (btl->rdma_local_cq, event_data, (gni_post_descriptor_t **) &desc);
rc = GNI_GetCompleted (ugni_module->rdma_local_cq, event_data, (gni_post_descriptor_t **) &desc);
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc && GNI_RC_TRANSACTION_ERROR != rc)) {
BTL_ERROR(("Error in GNI_GetComplete %s", gni_err_str[rc]));
return ompi_common_rc_ugni_to_ompi (rc);
@ -411,64 +409,84 @@ mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *btl)
frag = MCA_BTL_UGNI_DESC_TO_FRAG(desc);
if (OPAL_UNLIKELY(!GNI_CQ_STATUS_OK(event_data))) {
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc || !GNI_CQ_STATUS_OK(event_data))) {
(void) GNI_CqErrorRecoverable (event_data, &recoverable);
if (OPAL_UNLIKELY(++desc->tries >= mca_btl_ugni_component.rdma_max_retries ||
if (OPAL_UNLIKELY(++frag->post_desc.tries >= mca_btl_ugni_component.rdma_max_retries ||
!recoverable)) {
/* give up */
BTL_ERROR(("giving up on frag %p", (void *) frag))
BTL_ERROR(("giving up on frag %p", (void *) frag));
frag->cbfunc (frag, OMPI_ERROR);
return OMPI_ERROR;
}
/* repost transaction */
if (GNI_POST_RDMA_PUT == desc->base.type ||
GNI_POST_RDMA_GET == desc->base.type) {
rc = GNI_PostRdma (frag->endpoint->rdma_ep_handle, &desc->base);
} else {
rc = GNI_PostFma (frag->endpoint->rdma_ep_handle, &desc->base);
}
mca_btl_ugni_repost (frag, OMPI_SUCCESS);
return ompi_common_rc_ugni_to_ompi (rc);
return 0;
}
frag->cbfunc (frag, OMPI_SUCCESS);
BTL_VERBOSE(("RDMA/FMA complete for frag %p", frag));
frag->cbfunc (frag, ompi_common_rc_ugni_to_ompi (rc));
return 1;
}
static int
mca_btl_ugni_retry_failed (mca_btl_ugni_module_t *btl)
static inline int
mca_btl_ugni_retry_failed (mca_btl_ugni_module_t *ugni_module)
{
int count = opal_list_get_size (&btl->failed_frags);
opal_list_item_t *item;
int count = opal_list_get_size (&ugni_module->failed_frags);
int i;
while (count-- && NULL != (item = opal_list_remove_first (&btl->failed_frags))) {
mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) item;
for (i = 0 ; i < count ; ++i) {
mca_btl_ugni_base_frag_t *frag =
(mca_btl_ugni_base_frag_t *) opal_list_remove_first (&ugni_module->failed_frags);
assert (NULL != frag);
frag->cbfunc (frag, OMPI_SUCCESS);
}
return 0;
return count;
}
static int mca_btl_ugni_component_progress (void)
static inline int
mca_btl_ugni_progress_wait_list (mca_btl_ugni_module_t *ugni_module)
{
mca_btl_ugni_module_t *btl;
unsigned int i;
int count = 0;
int count = opal_list_get_size (&ugni_module->ep_wait_list);
int rc, i;
for (i = 0 ; i < mca_btl_ugni_component.ugni_num_btls ; ++i) {
btl = mca_btl_ugni_component.modules + i;
for (i = 0 ; i < count ; ++i) {
mca_btl_base_endpoint_t *endpoint =
(mca_btl_base_endpoint_t *) opal_list_remove_first (&ugni_module->ep_wait_list);
assert (NULL != endpoint);
mca_btl_ugni_retry_failed (btl);
count += mca_btl_ugni_progress_datagram (btl);
count += mca_btl_ugni_progress_local_smsg (btl);
count += mca_btl_ugni_progress_remote_smsg (btl);
count += mca_btl_ugni_progress_rdma (btl);
rc = mca_btl_progress_send_wait_list (endpoint);
if (OMPI_SUCCESS != rc) {
opal_list_append (&ugni_module->ep_wait_list, &endpoint->super);
}
}
return count;
}
static int mca_btl_ugni_component_progress (void)
{
mca_btl_ugni_module_t *ugni_module;
unsigned int i;
int count = 0;
for (i = 0 ; i < mca_btl_ugni_component.ugni_num_btls ; ++i) {
ugni_module = mca_btl_ugni_component.modules + i;
mca_btl_ugni_retry_failed (ugni_module);
mca_btl_ugni_progress_wait_list (ugni_module);
count += mca_btl_ugni_progress_datagram (ugni_module);
count += mca_btl_ugni_progress_local_smsg (ugni_module);
count += mca_btl_ugni_progress_remote_smsg (ugni_module);
count += mca_btl_ugni_progress_rdma (ugni_module);
}
return count;

Просмотреть файл

@ -11,47 +11,25 @@
*/
#include "btl_ugni.h"
#include "btl_ugni_endpoint.h"
#include "btl_ugni_frag.h"
#include "btl_ugni_smsg.h"
static void mca_btl_ugni_ep_construct (mca_btl_base_endpoint_t *ep);
static void mca_btl_ugni_ep_destruct (mca_btl_base_endpoint_t *ep);
OBJ_CLASS_INSTANCE(mca_btl_base_endpoint_t, opal_object_t,
mca_btl_ugni_ep_construct, mca_btl_ugni_ep_destruct);
static void mca_btl_ugni_ep_construct (mca_btl_base_endpoint_t *ep)
{
memset ((char *) ep + sizeof(ep->super), 0, sizeof (*ep) - sizeof (ep->super));
OBJ_CONSTRUCT(&ep->pending_list, opal_list_t);
OBJ_CONSTRUCT(&ep->pending_smsg_sends, opal_list_t);
OBJ_CONSTRUCT(&ep->frag_wait_list, opal_list_t);
OBJ_CONSTRUCT(&ep->lock, opal_mutex_t);
}
static void mca_btl_ugni_ep_destruct (mca_btl_base_endpoint_t *ep)
{
OBJ_DESTRUCT(&ep->pending_list);
OBJ_DESTRUCT(&ep->pending_smsg_sends);
OBJ_DESTRUCT(&ep->frag_wait_list);
OBJ_DESTRUCT(&ep->lock);
}
/*
 * Object constructor for an SMSG mailbox: fills in the mailbox's
 * gni_smsg_attr_t from the component-wide SMSG settings and from the
 * registration attached to the mailbox's free-list item.
 */
static void mca_btl_ugni_smsg_mbox_construct (mca_btl_ugni_smsg_mbox_t *mbox) {
struct mca_btl_ugni_reg_t *reg =
(struct mca_btl_ugni_reg_t *) mbox->super.registration;
/* initialize mailbox attributes */
mbox->smsg_attrib.msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
mbox->smsg_attrib.msg_maxsize = mca_btl_ugni_component.ugni_smsg_limit;
mbox->smsg_attrib.mbox_maxcredit = mca_btl_ugni_component.smsg_max_credits;
/* offset of this mailbox's memory from the start of the registered allocation */
mbox->smsg_attrib.mbox_offset = (uintptr_t) mbox->super.ptr - (uintptr_t) reg->base.alloc_base;
/* the buffer is given as the allocation base; mbox_offset locates this mailbox within it */
mbox->smsg_attrib.msg_buffer = reg->base.alloc_base;
mbox->smsg_attrib.buff_size = mca_btl_ugni_component.smsg_mbox_size;
mbox->smsg_attrib.mem_hndl = reg->memory_hdl;
}
OBJ_CLASS_INSTANCE(mca_btl_ugni_smsg_mbox_t, ompi_free_list_item_t,
mca_btl_ugni_smsg_mbox_construct, NULL);
OBJ_CLASS_INSTANCE(mca_btl_base_endpoint_t, opal_list_item_t,
mca_btl_ugni_ep_construct, mca_btl_ugni_ep_destruct);
static inline int mca_btl_ugni_ep_smsg_get_mbox (mca_btl_base_endpoint_t *ep) {
mca_btl_ugni_module_t *ugni_module = ep->btl;
@ -72,7 +50,7 @@ static inline int mca_btl_ugni_ep_smsg_get_mbox (mca_btl_base_endpoint_t *ep) {
}
int mca_btl_ugni_ep_disconnect (mca_btl_base_endpoint_t *ep, bool send_disconnect) {
int rc;
gni_return_t rc;
do {
if (MCA_BTL_UGNI_EP_STATE_INIT == ep->state) {
@ -93,10 +71,10 @@ int mca_btl_ugni_ep_disconnect (mca_btl_base_endpoint_t *ep, bool send_disconnec
(void) ompi_common_ugni_ep_destroy (&ep->smsg_ep_handle);
(void) ompi_common_ugni_ep_destroy (&ep->rdma_ep_handle);
ep->state = MCA_BTL_UGNI_EP_STATE_INIT;
OMPI_FREE_LIST_RETURN(&ep->btl->smsg_mboxes, ((ompi_free_list_item_t *) ep->mailbox));
ep->mailbox = NULL;
ep->state = MCA_BTL_UGNI_EP_STATE_INIT;
} while (0);
return OMPI_SUCCESS;
@ -109,6 +87,7 @@ static inline int mca_btl_ugni_ep_connect_start (mca_btl_base_endpoint_t *ep) {
ep->common->ep_rem_addr, ep->common->ep_rem_id));
/* bind endpoint to remote address */
/* we bind two endpoints to separate out local smsg completion and local fma completion */
rc = ompi_common_ugni_ep_create (ep->common, ep->btl->smsg_local_cq, &ep->smsg_ep_handle);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
return rc;
@ -134,16 +113,8 @@ static inline int mca_btl_ugni_ep_connect_start (mca_btl_base_endpoint_t *ep) {
return OMPI_SUCCESS;
}
/*
 * Fragment callback that attempts to send the fragment again on its own
 * endpoint.
 *
 * frag  fragment to resend; its endpoint and that endpoint's btl are used
 * rc    incoming value is ignored; the variable is reused for the send result
 *
 * If mca_btl_ugni_send returns a negative value the fragment is appended to
 * the module's failed_frags list.
 */
static void mca_btl_ugni_retry_send (mca_btl_ugni_base_frag_t *frag, int rc)
{
/* last argument is the bits of lag above bit 23 (presumably the tag -- confirm against mca_btl_ugni_send) */
rc = mca_btl_ugni_send (&frag->endpoint->btl->super, frag->endpoint, &frag->base, frag->hdr.send.lag >> 24);
if (OPAL_UNLIKELY(0 > rc)) {
opal_list_append (&frag->endpoint->btl->failed_frags, (opal_list_item_t *) frag);
}
}
static inline int mca_btl_ugni_ep_connect_finish (mca_btl_base_endpoint_t *ep) {
opal_list_item_t *item;
gni_return_t grc;
int rc;
BTL_VERBOSE(("finishing connection. remote attributes: msg_type = %d, msg_buffer = %p, buff_size = %d, "
@ -160,30 +131,36 @@ static inline int mca_btl_ugni_ep_connect_finish (mca_btl_base_endpoint_t *ep) {
ep->mailbox->smsg_attrib.mem_hndl.qword2, ep->mailbox->smsg_attrib.mbox_offset,
ep->mailbox->smsg_attrib.mbox_maxcredit, ep->mailbox->smsg_attrib.msg_maxsize));
rc = GNI_SmsgInit (ep->smsg_ep_handle, &ep->mailbox->smsg_attrib, &ep->remote_smsg_attrib);
if (GNI_RC_SUCCESS != rc) {
BTL_ERROR(("error initializing SMSG protocol. rc = %d", rc));
grc = GNI_SmsgInit (ep->smsg_ep_handle, &ep->mailbox->smsg_attrib, &ep->remote_smsg_attrib);
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc)) {
BTL_ERROR(("error initializing SMSG protocol. rc = %d", grc));
return ompi_common_rc_ugni_to_ompi (rc);
return ompi_common_rc_ugni_to_ompi (grc);
}
BTL_VERBOSE(("endpoint connected. posting %u sends", (unsigned int) opal_list_get_size (&ep->pending_list)));
ep->state = MCA_BTL_UGNI_EP_STATE_CONNECTED;
/* post pending sends */
while (NULL != (item = opal_list_remove_first (&ep->pending_list))) {
mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) item;
rc = mca_btl_ugni_send (&ep->btl->super, ep, &frag->base, frag->hdr.send.lag >> 24);
if (OPAL_UNLIKELY(0 > rc)) {
frag->cbfunc = mca_btl_ugni_retry_send;
opal_list_append (&ep->btl->failed_frags, (opal_list_item_t *) frag);
}
/* send all pending messages */
BTL_VERBOSE(("endpoint connected. posting %u sends", (unsigned int) opal_list_get_size (&ep->frag_wait_list)));
rc = mca_btl_progress_send_wait_list (ep);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
opal_list_append (&ep->btl->ep_wait_list, &ep->super);
}
return OMPI_SUCCESS;
}
/*
 * Post a datagram on the endpoint's SMSG endpoint handle.
 *
 * The local mailbox's smsg_attrib is passed as the first buffer and
 * ep->remote_smsg_attrib as the second (presumably the outgoing data and the
 * buffer that receives the peer's attributes -- confirm against the
 * GNI_EpPostDataWId documentation). The datagram id is
 * MCA_BTL_UGNI_CONNECT_DIRECTED_ID or'd with the remote id from ep->common.
 *
 * Returns the GNI return code converted by ompi_common_rc_ugni_to_ompi.
 */
static inline int mca_btl_ugni_directed_ep_post (mca_btl_base_endpoint_t *ep) {
gni_return_t rc;
rc = GNI_EpPostDataWId (ep->smsg_ep_handle, &ep->mailbox->smsg_attrib, sizeof (ep->mailbox->smsg_attrib),
&ep->remote_smsg_attrib, sizeof (ep->remote_smsg_attrib),
MCA_BTL_UGNI_CONNECT_DIRECTED_ID | ep->common->ep_rem_id);
return ompi_common_rc_ugni_to_ompi (rc);
}
int mca_btl_ugni_ep_connect_progress (mca_btl_base_endpoint_t *ep) {
int rc;
@ -199,6 +176,7 @@ int mca_btl_ugni_ep_connect_progress (mca_btl_base_endpoint_t *ep) {
}
if (GNI_SMSG_TYPE_INVALID == ep->remote_smsg_attrib.msg_type) {
/* use datagram to exchange connection information with the remote peer */
rc = mca_btl_ugni_directed_ep_post (ep);
if (OMPI_SUCCESS == rc) {
rc = OMPI_ERR_RESOURCE_BUSY;

Просмотреть файл

@ -22,15 +22,10 @@ enum mca_btl_ugni_endpoint_state_t {
};
typedef enum mca_btl_ugni_endpoint_state_t mca_btl_ugni_endpoint_state_t;
/* SMSG mailbox object: a free-list item paired with its SMSG attributes. */
typedef struct mca_btl_ugni_smsg_mbox_t {
/* base class; must stay first so the object can be cast to a free list item */
ompi_free_list_item_t super;
/* uGNI SMSG attributes describing this mailbox */
gni_smsg_attr_t smsg_attrib;
} mca_btl_ugni_smsg_mbox_t;
OBJ_CLASS_DECLARATION(mca_btl_ugni_smsg_mbox_t);
struct mca_btl_ugni_smsg_mbox_t;
typedef struct mca_btl_base_endpoint_t {
opal_object_t super;
opal_list_item_t super;
opal_mutex_t lock;
mca_btl_ugni_endpoint_state_t state;
@ -44,10 +39,9 @@ typedef struct mca_btl_base_endpoint_t {
gni_smsg_attr_t remote_smsg_attrib;
mca_btl_ugni_smsg_mbox_t *mailbox;
struct mca_btl_ugni_smsg_mbox_t *mailbox;
opal_list_t pending_list;
opal_list_t pending_smsg_sends;
opal_list_t frag_wait_list;
int32_t smsg_progressing;
} mca_btl_base_endpoint_t;
@ -86,7 +80,7 @@ static inline void mca_btl_ugni_release_ep (mca_btl_base_endpoint_t *ep) {
int rc;
rc = mca_btl_ugni_ep_disconnect (ep, false);
if (OMPI_SUCCESS == rc) {
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
BTL_VERBOSE(("btl/ugni error disconnecting endpoint"));
}
@ -133,14 +127,4 @@ static inline int mca_btl_ugni_wildcard_ep_post (mca_btl_ugni_module_t *ugni_mod
return ompi_common_rc_ugni_to_ompi (rc);
}
/*
 * Post a datagram on the endpoint's SMSG endpoint handle.
 *
 * The local mailbox's smsg_attrib is passed as the first buffer and
 * ep->remote_smsg_attrib as the second (presumably the outgoing data and the
 * buffer that receives the peer's attributes -- confirm against the
 * GNI_EpPostDataWId documentation). The datagram id is
 * MCA_BTL_UGNI_CONNECT_DIRECTED_ID or'd with the remote id from ep->common.
 *
 * Returns the GNI return code converted by ompi_common_rc_ugni_to_ompi.
 */
static inline int mca_btl_ugni_directed_ep_post (mca_btl_base_endpoint_t *ep) {
gni_return_t rc;
rc = GNI_EpPostDataWId (ep->smsg_ep_handle, &ep->mailbox->smsg_attrib, sizeof (ep->mailbox->smsg_attrib),
&ep->remote_smsg_attrib, sizeof (ep->remote_smsg_attrib),
MCA_BTL_UGNI_CONNECT_DIRECTED_ID | ep->common->ep_rem_id);
return ompi_common_rc_ugni_to_ompi (rc);
}
#endif /* MCA_BTL_UGNI_ENDPOINT_H */

Просмотреть файл

@ -13,23 +13,12 @@
#include "btl_ugni.h"
#include "btl_ugni_frag.h"
static inline void mca_btl_ugni_smsg_frag_constructor (mca_btl_ugni_base_frag_t *frag)
static inline void mca_btl_ugni_base_frag_constructor (mca_btl_ugni_base_frag_t *frag)
{
/* send memory does not need to be registered so we do not need a mpool */
memset ((char *) frag + sizeof (frag->base), 0, sizeof (*frag) - sizeof (frag->base));
frag->segments[0].seg_addr.pval = frag->base.super.ptr;
}
/* Fragment destructor: intentionally empty, nothing is released here. */
static inline void mca_btl_ugni_frag_destructor (mca_btl_ugni_base_frag_t *frag)
{
}
/*
 * Constructor for RDMA fragments: zeroes every byte of the fragment that
 * follows the leading `base` descriptor, leaving `base` itself untouched.
 */
static inline void mca_btl_ugni_rdma_frag_constructor (mca_btl_ugni_base_frag_t *frag)
{
/* we don't need any buffer memory for rdma frags */
/* start past `base` so the descriptor fields are not cleared */
memset ((char *) frag + sizeof (frag->base), 0, sizeof (*frag) - sizeof (frag->base));
}
static inline void mca_btl_ugni_eager_frag_constructor (mca_btl_ugni_base_frag_t *frag)
{
struct mca_btl_ugni_reg_t *reg =
@ -41,13 +30,13 @@ static inline void mca_btl_ugni_eager_frag_constructor (mca_btl_ugni_base_frag_t
}
OBJ_CLASS_INSTANCE(mca_btl_ugni_smsg_frag_t, mca_btl_base_descriptor_t,
mca_btl_ugni_smsg_frag_constructor, mca_btl_ugni_frag_destructor);
mca_btl_ugni_base_frag_constructor, NULL);
OBJ_CLASS_INSTANCE(mca_btl_ugni_rdma_frag_t, mca_btl_base_descriptor_t,
mca_btl_ugni_rdma_frag_constructor, mca_btl_ugni_frag_destructor);
mca_btl_ugni_base_frag_constructor, NULL);
OBJ_CLASS_INSTANCE(mca_btl_ugni_eager_frag_t, mca_btl_base_descriptor_t,
mca_btl_ugni_eager_frag_constructor, mca_btl_ugni_frag_destructor);
mca_btl_ugni_eager_frag_constructor, NULL);
void mca_btl_ugni_frag_init (mca_btl_ugni_base_frag_t *frag, mca_btl_ugni_module_t *ugni_module)
{

Просмотреть файл

@ -51,23 +51,28 @@ typedef union mca_btl_ugni_frag_hdr_t {
} mca_btl_ugni_frag_hdr_t;
enum {
MCA_BTL_UGNI_FRAG_BUFFERED = 1,
MCA_BTL_UGNI_FRAG_COMPLETE = 2,
MCA_BTL_UGNI_FRAG_EAGER = 4
MCA_BTL_UGNI_FRAG_BUFFERED = 1, /* frag data is buffered */
MCA_BTL_UGNI_FRAG_COMPLETE = 2, /* smsg complete for frag */
MCA_BTL_UGNI_FRAG_EAGER = 4, /* eager get frag */
MCA_BTL_UGNI_FRAG_IGNORE = 8 /* ignore local smsg completion */
};
struct mca_btl_ugni_base_frag_t;
typedef void (*frag_cb_t) (struct mca_btl_ugni_base_frag_t *, int);
typedef struct mca_btl_ugni_base_frag_t {
mca_btl_base_descriptor_t base;
mca_btl_base_segment_t segments[2];
mca_btl_ugni_frag_hdr_t hdr;
size_t hdr_size;
mca_btl_base_descriptor_t base;
size_t hdr_size;
mca_btl_ugni_frag_hdr_t hdr;
mca_btl_base_segment_t segments[2];
ompi_common_ugni_post_desc_t post_desc;
mca_btl_base_endpoint_t *endpoint;
mca_btl_ugni_reg_t *registration;
ompi_free_list_t *my_list;
uint32_t msg_id;
uint32_t flags;
void (*cbfunc) (struct mca_btl_ugni_base_frag_t*, int);
mca_btl_base_endpoint_t *endpoint;
mca_btl_ugni_reg_t *registration;
ompi_free_list_t *my_list;
uint32_t msg_id;
uint32_t flags;
frag_cb_t cbfunc;
} mca_btl_ugni_base_frag_t;
typedef struct mca_btl_ugni_base_frag_t mca_btl_ugni_smsg_frag_t;
@ -115,9 +120,11 @@ static inline int mca_btl_ugni_frag_return (mca_btl_ugni_base_frag_t *frag)
}
static inline void mca_btl_ugni_frag_complete (mca_btl_ugni_base_frag_t *frag, int rc) {
/* call callback if specified */
frag->flags |= MCA_BTL_UGNI_FRAG_COMPLETE;
BTL_VERBOSE(("frag complete. flags = %d", frag->base.des_flags));
/* call callback if specified */
if (frag->base.des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
frag->base.des_cbfunc(&frag->endpoint->btl->super, frag->endpoint, &frag->base, rc);
}

Просмотреть файл

@ -26,15 +26,11 @@ int mca_btl_ugni_get (struct mca_btl_base_module_t *btl,
mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) des;
size_t size = des->des_src->seg_len;
bool check;
int rc;
BTL_VERBOSE(("Using RDMA/FMA Get"));
/* Check if endpoint is connected */
rc = mca_btl_ugni_check_endpoint_state(endpoint);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc))
/* Ack! we should already be connected by this point (we got a smsg msg) */
return rc;
/* cause endpoint to bind if it isn't already (bind is sufficient for rdma) */
(void) mca_btl_ugni_check_endpoint_state(endpoint);
/* Check if the get is aligned/sized on a multiple of 4 */
check = !!((des->des_src->seg_addr.lval | des->des_dst->seg_addr.lval | size) & 3);
@ -44,23 +40,25 @@ int mca_btl_ugni_get (struct mca_btl_base_module_t *btl,
return OMPI_ERR_NOT_AVAILABLE;
}
if (NULL != frag->base.des_cbfunc) {
des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
}
des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
if (size <= mca_btl_ugni_component.ugni_fma_limit) {
return mca_btl_ugni_post_fma (frag, GNI_POST_FMA_GET, des->des_dst, des->des_src);
}
return mca_btl_ugni_post (frag, true, des->des_dst, des->des_src);
}
return mca_btl_ugni_post_bte (frag, GNI_POST_RDMA_GET, des->des_dst, des->des_src);
/*
 * Descriptor completion callback that sets MCA_BTL_DES_FLAGS_BTL_OWNERSHIP
 * on the descriptor. The btl, endpoint and rc arguments are not used.
 */
static void mca_btl_ugni_frag_set_ownership (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
mca_btl_base_descriptor_t *desc, int rc) {
desc->des_flags |= MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
}
static void mca_btl_ugni_callback_rdma_complete (mca_btl_ugni_base_frag_t *frag, int rc)
{
BTL_VERBOSE(("rdma operation for rem_ctx %p complete", frag->hdr.rdma.ctx));
frag->base.des_cbfunc = mca_btl_ugni_frag_set_ownership;
frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
/* tell peer the put is complete */
rc = ompi_mca_btl_ugni_smsg_send (frag, false, &frag->hdr.rdma, sizeof (frag->hdr.rdma),
rc = ompi_mca_btl_ugni_smsg_send (frag, &frag->hdr.rdma, sizeof (frag->hdr.rdma),
NULL, 0, MCA_BTL_UGNI_TAG_RDMA_COMPLETE);
if (OPAL_UNLIKELY(0 > rc)) {
/* call this callback again later */
@ -75,8 +73,10 @@ static void mca_btl_ugni_callback_eager_get_retry (mca_btl_ugni_base_frag_t *fra
(void) mca_btl_ugni_start_eager_get(frag->endpoint, frag->hdr.eager_ex, frag);
}
static void mca_btl_ugni_callback_eager_get (mca_btl_ugni_base_frag_t *frag, int rc)
static void mca_btl_ugni_callback_eager_get (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
mca_btl_base_descriptor_t *desc, int rc)
{
mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) desc;
uint32_t len = frag->hdr.eager.send.lag & 0x00ffffff;
uint8_t tag = frag->hdr.eager.send.lag >> 24;
size_t payload_len = frag->hdr.eager.src_seg.seg_len;
@ -114,7 +114,6 @@ int mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *ep,
mca_btl_ugni_eager_ex_frag_hdr_t hdr,
mca_btl_ugni_base_frag_t *frag)
{
mca_btl_ugni_reg_t *registration;
int rc;
if (OPAL_UNLIKELY(frag && frag->my_list == &ep->btl->rdma_int_frags)) {
@ -137,8 +136,8 @@ int mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *ep,
frag->hdr.eager_ex = hdr;
frag->base.des_cbfunc = NULL;
frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
frag->base.des_cbfunc = mca_btl_ugni_callback_eager_get;
frag->base.des_flags = MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
frag->base.des_dst = frag->segments;
frag->base.des_dst_cnt = 1;
@ -151,19 +150,10 @@ int mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *ep,
frag->segments[0].seg_len = frag->segments[1].seg_len =
(hdr.eager.src_seg.seg_len + 3) & ~3;
if (frag->segments[0].seg_len <= mca_btl_ugni_component.ugni_fma_limit) {
rc = mca_btl_ugni_post_fma (frag, GNI_POST_FMA_GET, frag->base.des_dst, frag->base.des_src);
} else {
rc = mca_btl_ugni_post_bte (frag, GNI_POST_RDMA_GET, frag->base.des_dst, frag->base.des_src);
rc = mca_btl_ugni_post (frag, true, frag->base.des_dst, frag->base.des_src);
if (OPAL_UNLIKELY(OMPI_SUCCESS == rc)) {
return OMPI_SUCCESS;
}
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
break;
}
frag->cbfunc = mca_btl_ugni_callback_eager_get;
return OMPI_SUCCESS;
} while (0);
frag->cbfunc = mca_btl_ugni_callback_eager_get_retry;

Просмотреть файл

@ -12,28 +12,11 @@
#include "ompi_config.h"
#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "opal/util/show_help.h"
#include "opal/align.h"
#include "ompi/mca/btl/base/base.h"
#include "ompi/mca/dpm/dpm.h"
#include "orte/util/proc_info.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/mca/btl/base/btl_base_error.h"
#include "btl_ugni.h"
#include "btl_ugni_frag.h"
#include "btl_ugni_endpoint.h"
#include "btl_ugni_smsg.h"
#include "btl_ugni_prepare.h"
#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <errno.h>
static int
mca_btl_ugni_free (struct mca_btl_base_module_t *btl,
mca_btl_base_descriptor_t *des);
@ -111,6 +94,7 @@ mca_btl_ugni_module_init (mca_btl_ugni_module_t *ugni_module,
OBJ_CONSTRUCT(&ugni_module->rdma_frags, ompi_free_list_t);
OBJ_CONSTRUCT(&ugni_module->rdma_int_frags, ompi_free_list_t);
OBJ_CONSTRUCT(&ugni_module->pending_smsg_frags_bb, opal_pointer_array_t);
OBJ_CONSTRUCT(&ugni_module->ep_wait_list, opal_list_t);
ugni_module->device = dev;
ugni_module->endpoints = NULL;
@ -132,13 +116,6 @@ mca_btl_ugni_module_init (mca_btl_ugni_module_t *ugni_module,
return rc;
}
rc = GNI_SmsgSetMaxRetrans (ugni_module->device->dev_handle,
mca_btl_ugni_component.smsg_max_retries);
if (GNI_RC_SUCCESS != rc) {
BTL_ERROR(("error setting maximum SMSG retries"));
return ompi_common_rc_ugni_to_ompi (rc);
}
return OMPI_SUCCESS;
}
@ -154,6 +131,7 @@ mca_btl_ugni_module_finalize (struct mca_btl_base_module_t *btl)
OBJ_DESTRUCT(&ugni_module->smsg_frags);
OBJ_DESTRUCT(&ugni_module->rdma_frags);
OBJ_DESTRUCT(&ugni_module->rdma_int_frags);
OBJ_DESTRUCT(&ugni_module->ep_wait_list);
/* close all open connections and release endpoints */
if (NULL != ugni_module->endpoints) {
@ -252,7 +230,7 @@ mca_btl_ugni_alloc(struct mca_btl_base_module_t *btl,
mca_btl_ugni_reg_t *registration;
frag->hdr_size = sizeof (frag->hdr.eager);
frag->flags |= MCA_BTL_UGNI_FRAG_EAGER;
frag->flags |= MCA_BTL_UGNI_FRAG_EAGER | MCA_BTL_UGNI_FRAG_IGNORE;
registration = (mca_btl_ugni_reg_t *) frag->base.super.registration;

Просмотреть файл

@ -50,7 +50,7 @@ mca_btl_ugni_prepare_src_send_inplace (struct mca_btl_base_module_t *btl,
return NULL;
}
frag->flags = MCA_BTL_UGNI_FRAG_EAGER;
frag->flags = MCA_BTL_UGNI_FRAG_EAGER | MCA_BTL_UGNI_FRAG_IGNORE;
frag->registration = registration;
memcpy ((void *) frag->segments[1].seg_key.key64,
@ -98,7 +98,7 @@ mca_btl_ugni_prepare_src_send_buffered (struct mca_btl_base_module_t *btl,
return NULL;
}
frag->flags = MCA_BTL_UGNI_FRAG_EAGER;
frag->flags = MCA_BTL_UGNI_FRAG_EAGER | MCA_BTL_UGNI_FRAG_IGNORE;
registration = (mca_btl_ugni_reg_t *) frag->base.super.registration;
@ -129,12 +129,6 @@ mca_btl_ugni_prepare_src_send_buffered (struct mca_btl_base_module_t *btl,
return NULL;
}
if (max_size != *size) {
fprintf (stderr, "**** max_size = %d. iov.iov_len = %d\n", max_size,
iov.iov_len);
abort();
}
frag->segments[0].seg_len = reserve;
frag->segments[1].seg_addr.pval = frag->base.super.ptr;

Просмотреть файл

@ -25,24 +25,13 @@ int mca_btl_ugni_put (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *des) {
mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) des;
int rc;
BTL_VERBOSE(("Using RDMA/FMA Put"));
BTL_VERBOSE(("Using RDMA/FMA Put for frag %p", (void *) des));
/* Check if endpoint is connected */
rc = mca_btl_ugni_check_endpoint_state(endpoint);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
/* Ack! We should already be connected by this point (we got an smsg msg) */
return rc;
}
/* cause endpoint to bind if it isn't already (bind is sufficient for rdma) */
(void) mca_btl_ugni_check_endpoint_state(endpoint);
if (NULL != frag->base.des_cbfunc) {
des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
}
des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
if (frag->base.des_src->seg_len <= mca_btl_ugni_component.ugni_fma_limit) {
return mca_btl_ugni_post_fma (frag, GNI_POST_FMA_PUT, des->des_src, des->des_dst);
}
return mca_btl_ugni_post_bte (frag, GNI_POST_RDMA_PUT, des->des_src, des->des_dst);
return mca_btl_ugni_post (frag, false, des->des_src, des->des_dst);
}

Просмотреть файл

@ -20,14 +20,14 @@ int mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *ep,
mca_btl_ugni_eager_ex_frag_hdr_t hdr,
mca_btl_ugni_base_frag_t *frag);
static inline int init_gni_post_desc(mca_btl_ugni_base_frag_t *frag,
gni_post_type_t op_type,
uint64_t lcl_addr,
gni_mem_handle_t *lcl_mdh,
uint64_t rem_addr,
gni_mem_handle_t *rem_mdh,
uint64_t bufsize,
gni_cq_handle_t cq_hndl) {
static inline void init_gni_post_desc (mca_btl_ugni_base_frag_t *frag,
gni_post_type_t op_type,
uint64_t lcl_addr,
gni_mem_handle_t *lcl_mdh,
uint64_t rem_addr,
gni_mem_handle_t *rem_mdh,
uint64_t bufsize,
gni_cq_handle_t cq_hndl) {
frag->post_desc.base.type = op_type;
frag->post_desc.base.cq_mode = GNI_CQMODE_GLOBAL_EVENT;
frag->post_desc.base.dlvr_mode = GNI_DLVMODE_PERFORMANCE;
@ -38,12 +38,7 @@ static inline int init_gni_post_desc(mca_btl_ugni_base_frag_t *frag,
frag->post_desc.base.length = bufsize;
frag->post_desc.base.rdma_mode = 0;
frag->post_desc.base.src_cq_hndl = cq_hndl;
frag->cbfunc = mca_btl_ugni_frag_complete;
frag->post_desc.endpoint = frag->endpoint->common;
frag->post_desc.tries = 0;
return 0;
frag->post_desc.tries = 0;
}
static inline int mca_btl_ugni_post_fma (mca_btl_ugni_base_frag_t *frag, gni_post_type_t op_type,
@ -86,4 +81,33 @@ static inline int mca_btl_ugni_post_bte (mca_btl_ugni_base_frag_t *frag, gni_pos
return OMPI_SUCCESS;
}
/**
 * Post a get or put for a fragment, choosing the transport by size.
 *
 * The fragment's completion callback is reset to mca_btl_ugni_frag_complete
 * before posting. The size test looks at frag->base.des_src->seg_len (not at
 * lcl_seg): anything larger than mca_btl_ugni_component.ugni_fma_limit is
 * handed to mca_btl_ugni_post_bte, everything else to mca_btl_ugni_post_fma.
 *
 * @param frag     fragment to post
 * @param get      true selects the *_GET post type, false the *_PUT post type
 * @param lcl_seg  local segment, forwarded unchanged to the post helper
 * @param rem_seg  remote segment, forwarded unchanged to the post helper
 *
 * @return the value returned by the selected post helper
 */
static inline int mca_btl_ugni_post (mca_btl_ugni_base_frag_t *frag, bool get, mca_btl_base_segment_t *lcl_seg,
                                     mca_btl_base_segment_t *rem_seg) {
    frag->cbfunc = mca_btl_ugni_frag_complete;

    /* too large for the fma limit: use the bte helper */
    if (frag->base.des_src->seg_len > mca_btl_ugni_component.ugni_fma_limit) {
        return mca_btl_ugni_post_bte (frag, get ? GNI_POST_RDMA_GET : GNI_POST_RDMA_PUT, lcl_seg, rem_seg);
    }

    return mca_btl_ugni_post_fma (frag, get ? GNI_POST_FMA_GET : GNI_POST_FMA_PUT, lcl_seg, rem_seg);
}
/**
 * Re-issue the post descriptor already stored in a fragment.
 *
 * The descriptor's type decides which GNI call is used: RDMA put/get types go
 * through GNI_PostRdma, every other type through GNI_PostFma. Both calls use
 * the endpoint's rdma_ep_handle.
 *
 * On success the fragment's callback is left as mca_btl_ugni_frag_complete.
 * If the post is refused, the callback is pointed back at this function and
 * the fragment is appended to the btl's failed_frags list.
 *
 * @param frag  fragment whose post_desc should be posted again
 * @param rc    not read by this function (NOTE(review): presumably present so
 *              the function matches the frag->cbfunc signature -- confirm)
 */
static inline void mca_btl_ugni_repost (mca_btl_ugni_base_frag_t *frag, int rc) {
    gni_return_t grc;

    frag->cbfunc = mca_btl_ugni_frag_complete;

    switch (frag->post_desc.base.type) {
    case GNI_POST_RDMA_PUT:
    case GNI_POST_RDMA_GET:
        grc = GNI_PostRdma (frag->endpoint->rdma_ep_handle, &frag->post_desc.base);
        break;
    default:
        grc = GNI_PostFma (frag->endpoint->rdma_ep_handle, &frag->post_desc.base);
        break;
    }

    if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc)) {
        /* post was not accepted: arrange for another attempt later */
        frag->cbfunc = mca_btl_ugni_repost;
        opal_list_append (&frag->endpoint->btl->failed_frags, (opal_list_item_t *) frag);
    }
}
#endif /* MCA_BTL_UGNI_RDMA_H */

Просмотреть файл

@ -15,32 +15,27 @@
#include "btl_ugni_smsg.h"
int mca_btl_ugni_send (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *btl_peer,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *descriptor,
mca_btl_base_tag_t tag)
{
mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) descriptor;
size_t size = frag->segments[0].seg_len + frag->segments[1].seg_len;
bool use_eager_get = !!(frag->flags & MCA_BTL_UGNI_FRAG_EAGER);
mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl;
int flags_save = frag->base.des_flags;
int rc;
BTL_VERBOSE(("btl/ugni sending descriptor %p from %d -> %d. length = %d", (void *)descriptor,
ORTE_PROC_MY_NAME->vpid, btl_peer->common->ep_rem_id, frag->segments[0].seg_len));
ORTE_PROC_MY_NAME->vpid, endpoint->common->ep_rem_id, frag->segments[0].seg_len));
/* tag and len are at the same location in eager and smsg frag hdrs */
frag->hdr.send.lag = (tag << 24) | size;
if (OPAL_UNLIKELY(use_eager_get)) {
frag->hdr.eager.src_seg = frag->segments[1];
frag->hdr.eager.ctx = (void *) frag;
}
frag->endpoint = endpoint;
frag->endpoint = btl_peer;
rc = mca_btl_ugni_check_endpoint_state (btl_peer);
rc = mca_btl_ugni_check_endpoint_state (endpoint);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
opal_list_append (&btl_peer->pending_list, (opal_list_item_t *) frag);
opal_list_append (&endpoint->frag_wait_list, (opal_list_item_t *) frag);
return OMPI_SUCCESS;
}
@ -48,38 +43,62 @@ int mca_btl_ugni_send (struct mca_btl_base_module_t *btl,
frag->base.des_flags &= ~(MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
frag->flags &= ~MCA_BTL_UGNI_FRAG_COMPLETE;
rc = ompi_mca_btl_ugni_smsg_send (frag, use_eager_get, &frag->hdr.send, frag->hdr_size,
frag->segments[1].seg_addr.pval, use_eager_get ? 0 :
frag->segments[1].seg_len, use_eager_get ?
MCA_BTL_UGNI_TAG_GET_INIT : MCA_BTL_UGNI_TAG_SEND);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
return rc;
rc = mca_btl_ugni_send_frag (endpoint, frag);
if (OPAL_LIKELY(frag->flags & MCA_BTL_UGNI_FRAG_COMPLETE)) {
/* fast path: remote side has received the frag */
frag->base.des_flags = flags_save;
mca_btl_ugni_frag_complete (frag, OMPI_SUCCESS);
return 1;
}
if (OPAL_LIKELY(!use_eager_get)) {
if (OPAL_LIKELY(frag->flags & MCA_BTL_UGNI_FRAG_COMPLETE)) {
/* fast path: remote side has received the frag */
frag->base.des_flags = flags_save;
mca_btl_ugni_frag_complete (frag, OMPI_SUCCESS);
if ((OMPI_SUCCESS == rc) && (frag->flags & MCA_BTL_UGNI_FRAG_BUFFERED) && (flags_save & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) {
/* fast(ish) path: btl owned buffered frag. report send as complete */
frag->base.des_flags = flags_save & ~MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
return 1;
if (OPAL_LIKELY(flags_save & MCA_BTL_DES_SEND_ALWAYS_CALLBACK)) {
frag->base.des_cbfunc(&frag->endpoint->btl->super, frag->endpoint, &frag->base, rc);
}
if ((frag->flags & MCA_BTL_UGNI_FRAG_BUFFERED) && (flags_save & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) {
/* fast(ish) path: btl owned buffered frag. report send as complete */
frag->base.des_flags = flags_save & ~MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
if (OPAL_LIKELY(flags_save & MCA_BTL_DES_SEND_ALWAYS_CALLBACK)) {
frag->base.des_cbfunc(&frag->endpoint->btl->super, frag->endpoint, &frag->base, rc);
}
return 1;
}
return 1;
}
/* slow(ish) path: remote side hasn't received the frag. call the frag's callback when
we get the local smsg/msgq or remote rdma completion */
frag->base.des_flags = flags_save | MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
if (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc)) {
/* queue up request */
if (0 == opal_list_get_size (&endpoint->frag_wait_list)) {
opal_list_append (&ugni_module->ep_wait_list, &endpoint->super);
}
opal_list_append (&endpoint->frag_wait_list, (opal_list_item_t *) frag);
rc = OMPI_SUCCESS;
}
return rc;
}
/**
 * Try to send every fragment queued on an endpoint's frag_wait_list.
 *
 * Fragments are removed from the head of the list one at a time and passed to
 * mca_btl_ugni_send_frag. Processing stops at the first negative return code:
 *  - OMPI_ERR_OUT_OF_RESOURCE: the fragment is put back at the head of the
 *    list so it is the first one tried next time;
 *  - any other error: the fragment is completed with that error code.
 *
 * @param endpoint  endpoint whose wait list should be progressed
 *
 * @return OMPI_SUCCESS once the list is empty, otherwise the negative code
 *         that stopped processing
 */
int mca_btl_progress_send_wait_list (mca_btl_base_endpoint_t *endpoint)
{
    mca_btl_ugni_base_frag_t *pending;
    int status;

    for (;;) {
        pending = (mca_btl_ugni_base_frag_t *) opal_list_remove_first (&endpoint->frag_wait_list);
        if (NULL == pending) {
            break;
        }

        status = mca_btl_ugni_send_frag (endpoint, pending);
        if (OPAL_UNLIKELY(OMPI_SUCCESS > status)) {
            if (OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE == status)) {
                /* keep the original ordering: this frag goes back to the front */
                opal_list_prepend (&endpoint->frag_wait_list, (opal_list_item_t *) pending);
            } else {
                mca_btl_ugni_frag_complete (pending, status);
            }

            return status;
        }
    }

    return OMPI_SUCCESS;
}

Просмотреть файл

@ -30,22 +30,22 @@ int mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl,
assert (length <= btl->btl_eager_limit && !(flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK));
if (OPAL_UNLIKELY(OMPI_SUCCESS != mca_btl_ugni_check_endpoint_state (endpoint))) {
max_data = payload_size;
frag = (mca_btl_ugni_base_frag_t *)
mca_btl_ugni_prepare_src_send_buffered (btl, endpoint, convertor,
order, header_size, &max_data,
flags | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
if (OPAL_UNLIKELY(NULL == frag || OMPI_SUCCESS != mca_btl_ugni_check_endpoint_state (endpoint))) {
/* can't complete inline send if the endpoint is not already connected */
/* go ahead and start the connection */
*descriptor = mca_btl_ugni_alloc (btl, endpoint, order, length, flags);
if (NULL != frag) {
mca_btl_ugni_frag_return (frag);
}
return OMPI_ERR_RESOURCE_BUSY;
}
max_data = payload_size;
frag = (mca_btl_ugni_base_frag_t *) mca_btl_ugni_prepare_src_send_buffered (btl, endpoint,
convertor,
order, header_size,
&max_data, flags);
if (OPAL_UNLIKELY(NULL == frag)) {
*descriptor = NULL;
return OMPI_ERR_OUT_OF_RESOURCE;
return !frag ? OMPI_ERR_OUT_OF_RESOURCE : OMPI_ERR_RESOURCE_BUSY;
}
assert (payload_size == max_data);
@ -53,29 +53,28 @@ int mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl,
BTL_VERBOSE(("btl/ugni sending inline descriptor %p from %d -> %d. length = %u", (void *) frag,
ORTE_PROC_MY_NAME->vpid, endpoint->common->ep_rem_id, (unsigned int) length));
frag->base.des_flags = flags | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
frag->hdr.send.lag = (tag << 24) | length;
/* write match header (with MPI comm/tag/etc. info) */
memmove (frag->base.des_src[0].seg_addr.pval, header, header_size);
/* send message */
if (OPAL_LIKELY(!(frag->flags & MCA_BTL_UGNI_FRAG_EAGER))) {
rc = ompi_mca_btl_ugni_smsg_send (frag, false, &frag->hdr.send_ex, frag->hdr_size,
frag->segments[1].seg_addr.pval, frag->segments[1].seg_len,
MCA_BTL_UGNI_TAG_SEND);
} else {
frag->hdr.eager.src_seg = frag->segments[1];
frag->hdr.eager.ctx = (void *) frag;
rc = ompi_mca_btl_ugni_smsg_send (frag, true, &frag->hdr.eager_ex, frag->hdr_size,
NULL, 0, MCA_BTL_UGNI_TAG_GET_INIT);
}
rc = mca_btl_ugni_send_frag (endpoint, frag);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
/* return this frag */
mca_btl_ugni_frag_return (frag);
if (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc)) {
mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl;
/* queue up request */
if (0 == opal_list_get_size (&endpoint->frag_wait_list)) {
opal_list_append (&ugni_module->ep_wait_list, &endpoint->super);
}
opal_list_append (&endpoint->frag_wait_list, (opal_list_item_t *) frag);
rc = OMPI_SUCCESS;
} else {
/* return this frag */
mca_btl_ugni_frag_return (frag);
*descriptor = NULL;
}
}
return rc;

Просмотреть файл

@ -13,6 +13,38 @@
#include "btl_ugni_smsg.h"
#include "btl_ugni_rdma.h"
/**
 * Constructor for mca_btl_ugni_smsg_mbox_t: fills in the SMSG attributes.
 *
 * The mailbox's free-list registration supplies the buffer base address and
 * memory handle; the component supplies the message size limit, credit count
 * and mailbox size. The mailbox offset is the distance from the registration's
 * alloc_base to this item's own ptr.
 *
 * @param mbox  mailbox object being constructed
 */
static void mca_btl_ugni_smsg_mbox_construct (mca_btl_ugni_smsg_mbox_t *mbox) {
    struct mca_btl_ugni_reg_t *registration =
        (struct mca_btl_ugni_reg_t *) mbox->super.registration;
    uintptr_t region_start = (uintptr_t) registration->base.alloc_base;
    uintptr_t mbox_start = (uintptr_t) mbox->super.ptr;

    /* where the mailbox lives */
    mbox->smsg_attrib.msg_buffer = registration->base.alloc_base;
    mbox->smsg_attrib.mbox_offset = mbox_start - region_start;
    mbox->smsg_attrib.mem_hndl = registration->memory_hdl;
    mbox->smsg_attrib.buff_size = mca_btl_ugni_component.smsg_mbox_size;

    /* messaging parameters taken from the component */
    mbox->smsg_attrib.msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
    mbox->smsg_attrib.msg_maxsize = mca_btl_ugni_component.ugni_smsg_limit;
    mbox->smsg_attrib.mbox_maxcredit = mca_btl_ugni_component.smsg_max_credits;
}
/* class instance for mca_btl_ugni_smsg_mbox_t; the arguments name
 * ompi_free_list_item_t, mca_btl_ugni_smsg_mbox_construct and NULL
 * (NOTE(review): presumably parent class, constructor and "no destructor",
 * in that order -- confirm against the OBJ_CLASS_INSTANCE definition) */
OBJ_CLASS_INSTANCE(mca_btl_ugni_smsg_mbox_t, ompi_free_list_item_t,
mca_btl_ugni_smsg_mbox_construct, NULL);
/**
 * Initialize SMSG settings for a ugni module.
 *
 * Applies the component's smsg_max_retries value to the module's device via
 * GNI_SmsgSetMaxRetrans.
 *
 * @param ugni_module  module whose device handle is configured
 *
 * @return OMPI_SUCCESS on success; otherwise the GNI return code translated
 *         by ompi_common_rc_ugni_to_ompi (an error is also logged)
 */
int mca_btl_ugni_smsg_init (mca_btl_ugni_module_t *ugni_module)
{
    gni_return_t rc = GNI_SmsgSetMaxRetrans (ugni_module->device->dev_handle,
                                             mca_btl_ugni_component.smsg_max_retries);

    if (GNI_RC_SUCCESS == rc) {
        return OMPI_SUCCESS;
    }

    BTL_ERROR(("error setting maximum SMSG retries"));
    return ompi_common_rc_ugni_to_ompi (rc);
}
/* progress */
int mca_btl_ugni_smsg_process (mca_btl_base_endpoint_t *ep)
{
@ -36,6 +68,7 @@ int mca_btl_ugni_smsg_process (mca_btl_base_endpoint_t *ep)
rc = GNI_SmsgGetNextWTag (ep->smsg_ep_handle, (void **) &data_ptr, &tag);
if (GNI_RC_NOT_DONE == rc) {
BTL_VERBOSE(("no smsg message waiting. rc = %d", rc));
ep->smsg_progressing = 0;
return count;
@ -151,17 +184,19 @@ int mca_btl_ugni_progress_remote_smsg (mca_btl_ugni_module_t *btl)
{
mca_btl_base_endpoint_t *ep;
gni_cq_entry_t event_data;
gni_return_t rc;
gni_return_t grc;
uint64_t inst_id;
int rc;
rc = GNI_CqGetEvent (btl->smsg_remote_cq, &event_data);
if (GNI_RC_NOT_DONE == rc) {
grc = GNI_CqGetEvent (btl->smsg_remote_cq, &event_data);
if (GNI_RC_NOT_DONE == grc) {
return 0;
}
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc || !GNI_CQ_STATUS_OK(event_data) ||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc || !GNI_CQ_STATUS_OK(event_data) ||
GNI_CQ_OVERRUN(event_data))) {
if (GNI_RC_ERROR_RESOURCE == rc ||
(GNI_RC_SUCCESS == rc && GNI_CQ_OVERRUN(event_data))) {
if (GNI_RC_ERROR_RESOURCE == grc ||
(GNI_RC_SUCCESS == grc && GNI_CQ_OVERRUN(event_data))) {
/* recover from smsg cq overrun */
return mca_btl_ugni_handle_remote_smsg_overrun (btl);
}
@ -170,7 +205,7 @@ int mca_btl_ugni_progress_remote_smsg (mca_btl_ugni_module_t *btl)
/* unhandled error: crash */
assert (0);
return ompi_common_rc_ugni_to_ompi (rc);
return ompi_common_rc_ugni_to_ompi (grc);
}
BTL_VERBOSE(("REMOTE CQ: Got event 0x%" PRIx64 ". msg id = %" PRIu64
@ -178,7 +213,9 @@ int mca_btl_ugni_progress_remote_smsg (mca_btl_ugni_module_t *btl)
GNI_CQ_GET_MSG_ID(event_data), GNI_CQ_STATUS_OK(event_data),
GNI_CQ_GET_TYPE(event_data)));
ep = btl->endpoints[GNI_CQ_GET_MSG_ID(event_data)];
inst_id = GNI_CQ_GET_INST_ID(event_data);
ep = btl->endpoints[inst_id & 0xffffffff];
if (OPAL_UNLIKELY(MCA_BTL_UGNI_EP_STATE_CONNECTED != ep->state)) {
/* due to the nature of datagrams we may get a smsg completion before
we get mailbox info from the peer */

Просмотреть файл

@ -25,6 +25,14 @@ typedef enum {
MCA_BTL_UGNI_TAG_RDMA_COMPLETE
} mca_btl_ugni_smsg_tag_t;
/* SMSG mailbox object: a free list item carrying the GNI SMSG attributes
 * for one mailbox. */
typedef struct mca_btl_ugni_smsg_mbox_t {
/* base free list item */
ompi_free_list_item_t super;
/* GNI SMSG attributes describing this mailbox */
gni_smsg_attr_t smsg_attrib;
} mca_btl_ugni_smsg_mbox_t;
OBJ_CLASS_DECLARATION(mca_btl_ugni_smsg_mbox_t);
int mca_btl_ugni_smsg_init (mca_btl_ugni_module_t *ugni_module);
int mca_btl_ugni_smsg_process (mca_btl_base_endpoint_t *ep);
int mca_btl_ugni_progress_remote_smsg (mca_btl_ugni_module_t *btl);
@ -32,51 +40,45 @@ static inline int mca_btl_ugni_progress_local_smsg (mca_btl_ugni_module_t *ugni_
{
mca_btl_ugni_base_frag_t *frag;
gni_cq_entry_t event_data;
gni_return_t rc;
uint32_t msg_id;
gni_return_t grc;
rc = GNI_CqGetEvent (ugni_module->smsg_local_cq, &event_data);
if (GNI_RC_NOT_DONE == rc) {
grc = GNI_CqGetEvent (ugni_module->smsg_local_cq, &event_data);
if (GNI_RC_NOT_DONE == grc) {
return OMPI_SUCCESS;
}
if (OPAL_UNLIKELY((GNI_RC_SUCCESS != rc && !event_data) || GNI_CQ_OVERRUN(event_data))) {
if (OPAL_UNLIKELY((GNI_RC_SUCCESS != grc && !event_data) || GNI_CQ_OVERRUN(event_data))) {
/* TODO -- need to handle overrun -- how do we do this without an event?
will the event eventually come back? Ask Cray */
BTL_ERROR(("post error! cq overrun = %d", (int)GNI_CQ_OVERRUN(event_data)));
assert (0);
return ompi_common_rc_ugni_to_ompi (rc);
return ompi_common_rc_ugni_to_ompi (grc);
}
assert (GNI_CQ_GET_TYPE(event_data) == GNI_CQ_EVENT_TYPE_SMSG);
msg_id = GNI_CQ_GET_MSG_ID(event_data);
if ((uint32_t) -1 == msg_id) {
/* nothing to do */
return OMPI_SUCCESS;
}
frag = (mca_btl_ugni_base_frag_t *) opal_pointer_array_get_item (&ugni_module->pending_smsg_frags_bb,
msg_id);
assert (NULL != frag);
GNI_CQ_GET_MSG_ID(event_data));
if (OPAL_UNLIKELY(NULL == frag)) {
assert (0);
return OMPI_ERROR;
}
mca_btl_ugni_frag_complete (frag, rc);
if (!(frag->flags & MCA_BTL_UGNI_FRAG_IGNORE)) {
mca_btl_ugni_frag_complete (frag, OMPI_SUCCESS);
}
return 1;
}
static inline int ompi_mca_btl_ugni_smsg_send (mca_btl_ugni_base_frag_t *frag,
const bool ignore_local_comp,
void *hdr, size_t hdr_len,
void *payload, size_t payload_len,
mca_btl_ugni_smsg_tag_t tag) {
gni_return_t grc;
grc = GNI_SmsgSendWTag (frag->endpoint->smsg_ep_handle, hdr, hdr_len, payload, payload_len,
ignore_local_comp ? (uint32_t) -1 : frag->msg_id, tag);
grc = GNI_SmsgSendWTag (frag->endpoint->smsg_ep_handle, hdr, hdr_len,
payload, payload_len, frag->msg_id, tag);
(void) mca_btl_ugni_progress_local_smsg ((mca_btl_ugni_module_t *) frag->endpoint->btl);
@ -84,19 +86,33 @@ static inline int ompi_mca_btl_ugni_smsg_send (mca_btl_ugni_base_frag_t *frag,
return OMPI_SUCCESS;
}
/* see if we can free up some credits */
(void) mca_btl_ugni_progress_remote_smsg ((mca_btl_ugni_module_t *) frag->endpoint->btl);
if (OPAL_LIKELY(GNI_RC_NOT_DONE == grc)) {
BTL_VERBOSE(("out of credits"));
return OMPI_ERR_OUT_OF_RESOURCE;
}
(void) mca_btl_ugni_progress_remote_smsg ((mca_btl_ugni_module_t *) frag->endpoint->btl);
BTL_ERROR(("GNI_SmsgSendWTag failed with rc = %d. handle = %lu, hdr_len = %d, payload_len = %d",
grc, (uintptr_t) frag->endpoint->smsg_ep_handle, (int) hdr_len, (int) payload_len));
return OMPI_ERROR;
}
/**
 * Send a fragment over SMSG, using the header that matches its kind.
 *
 * Fragments without MCA_BTL_UGNI_FRAG_EAGER are sent with their send header
 * plus segments[1] as payload, tagged MCA_BTL_UGNI_TAG_SEND. Fragments with
 * the flag get their eager header filled in (source segment and the fragment
 * pointer as context) and are sent with no payload, tagged
 * MCA_BTL_UGNI_TAG_GET_INIT.
 *
 * @param btl_peer  not referenced in this function body
 * @param frag      fragment to send
 *
 * @return the value returned by ompi_mca_btl_ugni_smsg_send
 */
static inline int mca_btl_ugni_send_frag (struct mca_btl_base_endpoint_t *btl_peer,
                                          mca_btl_ugni_base_frag_t *frag) {
    int result;

    if (OPAL_LIKELY(!(frag->flags & MCA_BTL_UGNI_FRAG_EAGER))) {
        /* data travels inside the smsg message itself */
        result = ompi_mca_btl_ugni_smsg_send (frag, &frag->hdr.send, frag->hdr_size,
                                              frag->segments[1].seg_addr.pval, frag->segments[1].seg_len,
                                              MCA_BTL_UGNI_TAG_SEND);
    } else {
        /* header-only message describing the source segment */
        frag->hdr.eager.src_seg = frag->segments[1];
        frag->hdr.eager.ctx = (void *) frag;

        result = ompi_mca_btl_ugni_smsg_send (frag, &frag->hdr.eager, frag->hdr_size,
                                              NULL, 0, MCA_BTL_UGNI_TAG_GET_INIT);
    }

    return result;
}
#endif /* MCA_BTL_UGNI_SMSG_H */