initial thread safety for ugni btl
This commit adds initial ugni thread safety support. With this commit, the Sun thread tests (excepting MPI-2 RMA) pass with various process counts and threads per process. osu_latency_mt also passes.
This commit is contained in:
parent
81917412a8
commit
9947758d98
@ -50,6 +50,7 @@ typedef struct mca_btl_ugni_endpoint_attr_t {
|
||||
uint64_t proc_id;
|
||||
uint32_t index;
|
||||
gni_smsg_attr_t smsg_attr;
|
||||
gni_mem_handle_t rmt_irq_mem_hndl;
|
||||
} mca_btl_ugni_endpoint_attr_t;
|
||||
|
||||
enum {
|
||||
@ -68,17 +69,23 @@ typedef struct mca_btl_ugni_module_t {
|
||||
opal_pointer_array_t endpoints;
|
||||
opal_hash_table_t id_to_endpoint;
|
||||
|
||||
/* lock for this list */
|
||||
opal_mutex_t failed_frags_lock;
|
||||
opal_list_t failed_frags;
|
||||
|
||||
mca_mpool_base_module_t *smsg_mpool;
|
||||
ompi_free_list_t smsg_mboxes;
|
||||
|
||||
gni_ep_handle_t wildcard_ep;
|
||||
gni_ep_handle_t local_ep;
|
||||
|
||||
struct mca_btl_ugni_endpoint_attr_t wc_remote_attr, wc_local_attr;
|
||||
|
||||
gni_cq_handle_t rdma_local_cq;
|
||||
gni_cq_handle_t smsg_remote_cq;
|
||||
gni_cq_handle_t smsg_local_cq;
|
||||
gni_cq_handle_t smsg_remote_irq_cq;
|
||||
gni_cq_handle_t rdma_local_irq_cq;
|
||||
|
||||
/* eager fragment list (registered) */
|
||||
ompi_free_list_t eager_frags_send;
|
||||
@ -91,6 +98,9 @@ typedef struct mca_btl_ugni_module_t {
|
||||
ompi_free_list_t rdma_frags;
|
||||
ompi_free_list_t rdma_int_frags;
|
||||
|
||||
|
||||
/* lock for this list */
|
||||
opal_mutex_t ep_wait_list_lock;
|
||||
/* endpoints waiting on credits */
|
||||
opal_list_t ep_wait_list;
|
||||
|
||||
@ -98,13 +108,13 @@ typedef struct mca_btl_ugni_module_t {
|
||||
opal_pointer_array_t pending_smsg_frags_bb;
|
||||
|
||||
uint32_t reg_max;
|
||||
uint32_t reg_count;
|
||||
volatile int reg_count;
|
||||
|
||||
/* used to calculate the fraction of registered memory resources
|
||||
* this rank should be limited too */
|
||||
int nlocal_procs;
|
||||
|
||||
int active_send_count;
|
||||
volatile int active_send_count;
|
||||
} mca_btl_ugni_module_t;
|
||||
|
||||
typedef struct mca_btl_ugni_component_t {
|
||||
@ -294,4 +304,5 @@ static inline uint64_t mca_btl_ugni_proc_name_to_id (opal_process_name_t name) {
|
||||
return ((uint64_t) (opal_process_name_jobid(name) & 0x7fffffff) << 32 | opal_process_name_vpid(name));
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
@ -92,22 +92,28 @@ int mca_btl_ugni_add_procs(struct mca_btl_base_module_t* btl,
|
||||
mca_btl_ugni_module_set_max_reg (ugni_module, ugni_module->nlocal_procs);
|
||||
|
||||
if (false == ugni_module->initialized) {
|
||||
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
|
||||
rc = GNI_CqCreate (ugni_module->device->dev_handle, mca_btl_ugni_component.local_cq_size,
|
||||
0, GNI_CQ_NOBLOCK, NULL, NULL, &ugni_module->rdma_local_cq);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
|
||||
if (GNI_RC_SUCCESS != rc) {
|
||||
BTL_ERROR(("error creating local BTE/FMA CQ"));
|
||||
return opal_common_rc_ugni_to_opal (rc);
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
|
||||
rc = GNI_CqCreate (ugni_module->device->dev_handle, mca_btl_ugni_component.local_cq_size,
|
||||
0, GNI_CQ_NOBLOCK, NULL, NULL, &ugni_module->smsg_local_cq);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
|
||||
if (GNI_RC_SUCCESS != rc) {
|
||||
BTL_ERROR(("error creating local SMSG CQ"));
|
||||
return opal_common_rc_ugni_to_opal (rc);
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
|
||||
rc = GNI_CqCreate (ugni_module->device->dev_handle, mca_btl_ugni_component.remote_cq_size,
|
||||
0, GNI_CQ_NOBLOCK, NULL, NULL, &ugni_module->smsg_remote_cq);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
|
||||
if (GNI_RC_SUCCESS != rc) {
|
||||
BTL_ERROR(("error creating remote SMSG CQ"));
|
||||
return opal_common_rc_ugni_to_opal (rc);
|
||||
@ -179,14 +185,17 @@ static int ugni_reg_rdma_mem (void *reg_data, void *base, size_t size,
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
|
||||
rc = GNI_MemRegister (ugni_module->device->dev_handle, (uint64_t) base,
|
||||
size, NULL, GNI_MEM_READWRITE | GNI_MEM_RELAXED_PI_ORDERING,
|
||||
-1, &(ugni_reg->memory_hdl));
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
|
||||
|
||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
ugni_module->reg_count++;
|
||||
opal_atomic_add_32(&ugni_module->reg_count,1);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
@ -199,9 +208,11 @@ static int ugni_reg_smsg_mem (void *reg_data, void *base, size_t size,
|
||||
mca_btl_ugni_reg_t *ugni_reg = (mca_btl_ugni_reg_t *) reg;
|
||||
gni_return_t rc;
|
||||
|
||||
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
|
||||
rc = GNI_MemRegister (ugni_module->device->dev_handle, (uint64_t) base,
|
||||
size, ugni_module->smsg_remote_cq, GNI_MEM_READWRITE, -1,
|
||||
&(ugni_reg->memory_hdl));
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
|
||||
return opal_common_rc_ugni_to_opal (rc);
|
||||
}
|
||||
|
||||
@ -212,12 +223,14 @@ ugni_dereg_mem (void *reg_data, mca_mpool_base_registration_t *reg)
|
||||
mca_btl_ugni_reg_t *ugni_reg = (mca_btl_ugni_reg_t *)reg;
|
||||
gni_return_t rc;
|
||||
|
||||
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
|
||||
rc = GNI_MemDeregister (ugni_module->device->dev_handle, &ugni_reg->memory_hdl);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
|
||||
if (GNI_RC_SUCCESS != rc) {
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
ugni_module->reg_count--;
|
||||
opal_atomic_add_32(&ugni_module->reg_count,-1);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
@ -55,7 +55,7 @@ btl_ugni_component_register(void)
|
||||
int rc;
|
||||
|
||||
(void) mca_base_var_group_component_register(&mca_btl_ugni_component.super.btl_version,
|
||||
"Gemini byte transport layer");
|
||||
"uGNI byte transport layer");
|
||||
|
||||
mca_btl_ugni_component.ugni_free_list_num = 8;
|
||||
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
||||
@ -63,7 +63,7 @@ btl_ugni_component_register(void)
|
||||
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
|
||||
&mca_btl_ugni_component.ugni_free_list_num);
|
||||
mca_btl_ugni_component.ugni_free_list_max = 16384;
|
||||
mca_btl_ugni_component.ugni_free_list_max = 4096;
|
||||
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
||||
"free_list_max", NULL, MCA_BASE_VAR_TYPE_INT,
|
||||
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
||||
@ -285,13 +285,6 @@ mca_btl_ugni_component_init (int *num_btl_modules,
|
||||
unsigned int i;
|
||||
int rc;
|
||||
|
||||
/* Currently refuse to run if MPI_THREAD_MULTIPLE is enabled */
|
||||
if (opal_using_threads() && !mca_btl_base_thread_multiple_override) {
|
||||
opal_output_verbose(5, opal_btl_base_framework.framework_output,
|
||||
"btl:ugni: MPI_THREAD_MULTIPLE not supported; skipping this component");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (16384 < mca_btl_ugni_component.ugni_smsg_limit) {
|
||||
mca_btl_ugni_component.ugni_smsg_limit = 16384;
|
||||
}
|
||||
@ -365,7 +358,9 @@ mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module)
|
||||
int count = 0, rc;
|
||||
|
||||
/* check for datagram completion */
|
||||
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); /* TODO: may not need lock for this function */
|
||||
grc = GNI_PostDataProbeById (ugni_module->device->dev_handle, &datagram_id);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
|
||||
if (OPAL_LIKELY(GNI_RC_SUCCESS != grc)) {
|
||||
return 0;
|
||||
}
|
||||
@ -382,8 +377,10 @@ mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module)
|
||||
}
|
||||
|
||||
/* wait for the incoming datagram to complete (in case it isn't) */
|
||||
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); /* TODO: may not need lock for this function */
|
||||
grc = GNI_EpPostDataWaitById (handle, datagram_id, -1, &post_state,
|
||||
&remote_addr, &remote_id);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
|
||||
if (GNI_RC_SUCCESS != grc) {
|
||||
BTL_ERROR(("GNI_EpPostDataWaitById failed with rc = %d", grc));
|
||||
return opal_common_rc_ugni_to_opal (grc);
|
||||
@ -429,28 +426,34 @@ mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module)
|
||||
}
|
||||
|
||||
static inline int
|
||||
mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module)
|
||||
mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, int which_cq)
|
||||
{
|
||||
opal_common_ugni_post_desc_t *desc;
|
||||
mca_btl_ugni_base_frag_t *frag;
|
||||
gni_cq_entry_t event_data = 0;
|
||||
uint32_t recoverable = 1;
|
||||
gni_return_t rc;
|
||||
gni_cq_handle_t the_cq;
|
||||
|
||||
rc = GNI_CqGetEvent (ugni_module->rdma_local_cq, &event_data);
|
||||
the_cq = (which_cq == 0) ? ugni_module->rdma_local_cq : ugni_module->rdma_local_irq_cq;
|
||||
|
||||
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
|
||||
rc = GNI_CqGetEvent (the_cq, &event_data);
|
||||
if (GNI_RC_NOT_DONE == rc) {
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (OPAL_UNLIKELY((GNI_RC_SUCCESS != rc && !event_data) || GNI_CQ_OVERRUN(event_data))) {
|
||||
/* TODO -- need to handle overrun -- how do we do this without an event?
|
||||
will the event eventually come back? Ask Cray */
|
||||
BTL_ERROR(("unhandled post error! ugni rc = %d", rc));
|
||||
assert (0);
|
||||
BTL_ERROR(("unhandled post error! ugni rc = %d %s", rc,gni_err_str[rc]));
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
|
||||
return opal_common_rc_ugni_to_opal (rc);
|
||||
}
|
||||
|
||||
rc = GNI_GetCompleted (ugni_module->rdma_local_cq, event_data, (gni_post_descriptor_t **) &desc);
|
||||
rc = GNI_GetCompleted (the_cq, event_data, (gni_post_descriptor_t **) &desc);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
|
||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc && GNI_RC_TRANSACTION_ERROR != rc)) {
|
||||
BTL_ERROR(("Error in GNI_GetComplete %s", gni_err_str[rc]));
|
||||
return opal_common_rc_ugni_to_opal (rc);
|
||||
@ -459,13 +462,18 @@ mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module)
|
||||
frag = MCA_BTL_UGNI_DESC_TO_FRAG(desc);
|
||||
|
||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc || !GNI_CQ_STATUS_OK(event_data))) {
|
||||
char buffer[1024];
|
||||
|
||||
(void) GNI_CqErrorRecoverable (event_data, &recoverable);
|
||||
GNI_CqErrorStr(event_data,buffer,sizeof(buffer));
|
||||
|
||||
if (OPAL_UNLIKELY(++frag->post_desc.tries >= mca_btl_ugni_component.rdma_max_retries ||
|
||||
!recoverable)) {
|
||||
/* give up */
|
||||
BTL_ERROR(("giving up on frag %p", (void *) frag));
|
||||
frag->cbfunc (frag, OPAL_ERROR);
|
||||
BTL_ERROR(("giving up on frag %p type %d CQE error %s", (void *) frag, frag->post_desc.base.type, buffer));
|
||||
if (frag->cbfunc) {
|
||||
frag->cbfunc (frag, OPAL_ERROR);
|
||||
}
|
||||
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
@ -478,7 +486,9 @@ mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module)
|
||||
|
||||
BTL_VERBOSE(("RDMA/FMA complete for frag %p", (void *) frag));
|
||||
|
||||
frag->cbfunc (frag, opal_common_rc_ugni_to_opal (rc));
|
||||
if (frag->cbfunc) {
|
||||
frag->cbfunc (frag, opal_common_rc_ugni_to_opal (rc));
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
@ -490,11 +500,15 @@ mca_btl_ugni_retry_failed (mca_btl_ugni_module_t *ugni_module)
|
||||
int i;
|
||||
|
||||
for (i = 0 ; i < count ; ++i) {
|
||||
OPAL_THREAD_LOCK(&ugni_module->failed_frags_lock);
|
||||
mca_btl_ugni_base_frag_t *frag =
|
||||
(mca_btl_ugni_base_frag_t *) opal_list_remove_first (&ugni_module->failed_frags);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->failed_frags_lock);
|
||||
assert (NULL != frag);
|
||||
|
||||
frag->cbfunc (frag, OPAL_SUCCESS);
|
||||
if (frag->cbfunc) {
|
||||
frag->cbfunc (frag, OPAL_SUCCESS);
|
||||
}
|
||||
}
|
||||
|
||||
return count;
|
||||
@ -503,24 +517,39 @@ mca_btl_ugni_retry_failed (mca_btl_ugni_module_t *ugni_module)
|
||||
static inline int
|
||||
mca_btl_ugni_progress_wait_list (mca_btl_ugni_module_t *ugni_module)
|
||||
{
|
||||
int count = opal_list_get_size (&ugni_module->ep_wait_list);
|
||||
int rc, i;
|
||||
int rc = OPAL_SUCCESS;
|
||||
mca_btl_base_endpoint_t *endpoint = NULL;
|
||||
int count;
|
||||
|
||||
for (i = 0 ; i < count ; ++i) {
|
||||
mca_btl_base_endpoint_t *endpoint =
|
||||
(mca_btl_base_endpoint_t *) opal_list_remove_first (&ugni_module->ep_wait_list);
|
||||
assert (NULL != endpoint);
|
||||
OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
|
||||
count = opal_list_get_size(&ugni_module->ep_wait_list);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
|
||||
|
||||
endpoint->wait_listed = false;
|
||||
do {
|
||||
OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
|
||||
endpoint = (mca_btl_base_endpoint_t *) opal_list_remove_first (&ugni_module->ep_wait_list);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
|
||||
if (endpoint != NULL) {
|
||||
|
||||
rc = mca_btl_ugni_progress_send_wait_list (endpoint);
|
||||
if (OPAL_SUCCESS != rc && false == endpoint->wait_listed) {
|
||||
opal_list_append (&ugni_module->ep_wait_list, &endpoint->super);
|
||||
endpoint->wait_listed = true;
|
||||
endpoint->wait_listed = false;
|
||||
|
||||
rc = mca_btl_ugni_progress_send_wait_list (endpoint);
|
||||
|
||||
if (OPAL_SUCCESS != rc && false == endpoint->wait_listed) {
|
||||
|
||||
endpoint->wait_listed = true;
|
||||
OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
|
||||
opal_list_append (&ugni_module->ep_wait_list, &endpoint->super);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return count;
|
||||
--count;
|
||||
if (count == 0) break;
|
||||
|
||||
} while (endpoint != NULL) ;
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
static int mca_btl_ugni_component_progress (void)
|
||||
@ -538,7 +567,8 @@ static int mca_btl_ugni_component_progress (void)
|
||||
count += mca_btl_ugni_progress_datagram (ugni_module);
|
||||
count += mca_btl_ugni_progress_local_smsg (ugni_module);
|
||||
count += mca_btl_ugni_progress_remote_smsg (ugni_module);
|
||||
count += mca_btl_ugni_progress_rdma (ugni_module);
|
||||
count += mca_btl_ugni_progress_rdma (ugni_module, 0);
|
||||
|
||||
}
|
||||
|
||||
return count;
|
||||
|
@ -58,17 +58,24 @@ int mca_btl_ugni_ep_disconnect (mca_btl_base_endpoint_t *ep, bool send_disconnec
|
||||
}
|
||||
|
||||
if (MCA_BTL_UGNI_EP_STATE_CONNECTED == ep->state && send_disconnect) {
|
||||
OPAL_THREAD_LOCK(&ep->common->dev->dev_lock);
|
||||
rc = GNI_SmsgSendWTag (ep->smsg_ep_handle, NULL, 0, NULL, 0, -1,
|
||||
MCA_BTL_UGNI_TAG_DISCONNECT);
|
||||
OPAL_THREAD_UNLOCK(&ep->common->dev->dev_lock);
|
||||
if (GNI_RC_SUCCESS != rc) {
|
||||
BTL_VERBOSE(("btl/ugni could not send close message"));
|
||||
}
|
||||
|
||||
/* we might want to wait for local completion here (do we even care) */
|
||||
/* we might want to wait for local completion here (do we even care), yes we do */
|
||||
/* TODO: FIX FIX FIX */
|
||||
|
||||
}
|
||||
|
||||
/* TODO: FIX GROSS */
|
||||
OPAL_THREAD_LOCK(&ep->common->dev->dev_lock);
|
||||
(void) opal_common_ugni_ep_destroy (&ep->smsg_ep_handle);
|
||||
(void) opal_common_ugni_ep_destroy (&ep->rdma_ep_handle);
|
||||
OPAL_THREAD_UNLOCK(&ep->common->dev->dev_lock);
|
||||
|
||||
OMPI_FREE_LIST_RETURN_MT(&ep->btl->smsg_mboxes, ((ompi_free_list_item_t *) ep->mailbox));
|
||||
ep->mailbox = NULL;
|
||||
@ -89,7 +96,7 @@ static inline int mca_btl_ugni_ep_connect_start (mca_btl_base_endpoint_t *ep) {
|
||||
}
|
||||
|
||||
BTL_VERBOSE(("initiaiting connection to remote peer with address: %u id: %u proc: %p",
|
||||
ep->common->ep_rem_addr, ep->common->ep_rem_id, ep->peer_proc));
|
||||
ep->common->ep_rem_addr, ep->common->ep_rem_id, (void *)ep->peer_proc));
|
||||
|
||||
/* bind endpoint to remote address */
|
||||
/* we bind two endpoints to seperate out local smsg completion and local fma completion */
|
||||
@ -150,6 +157,7 @@ static inline int mca_btl_ugni_ep_connect_finish (mca_btl_base_endpoint_t *ep) {
|
||||
GNI_EpSetEventData (ep->rdma_ep_handle, ep->index, ep->remote_attr.index);
|
||||
GNI_EpSetEventData (ep->smsg_ep_handle, ep->index, ep->remote_attr.index);
|
||||
|
||||
ep->rmt_irq_mem_hndl = ep->remote_attr.rmt_irq_mem_hndl;
|
||||
ep->state = MCA_BTL_UGNI_EP_STATE_CONNECTED;
|
||||
|
||||
/* send all pending messages */
|
||||
@ -158,7 +166,9 @@ static inline int mca_btl_ugni_ep_connect_finish (mca_btl_base_endpoint_t *ep) {
|
||||
rc = mca_btl_ugni_progress_send_wait_list (ep);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
ep->wait_listed = true;
|
||||
OPAL_THREAD_LOCK(&ep->btl->ep_wait_list_lock);
|
||||
opal_list_append (&ep->btl->ep_wait_list, &ep->super);
|
||||
OPAL_THREAD_UNLOCK(&ep->btl->ep_wait_list_lock);
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
@ -167,7 +177,8 @@ static inline int mca_btl_ugni_ep_connect_finish (mca_btl_base_endpoint_t *ep) {
|
||||
static inline int mca_btl_ugni_directed_ep_post (mca_btl_base_endpoint_t *ep) {
|
||||
gni_return_t rc;
|
||||
|
||||
BTL_VERBOSE(("posting directed datagram to remote id: %d for endpoint %p", ep->common->ep_rem_id, ep));
|
||||
BTL_VERBOSE(("posting directed datagram to remote id: %d for endpoint %p", ep->common->ep_rem_id, (void *)ep));
|
||||
ep->mailbox->attr.rmt_irq_mem_hndl = mca_btl_ugni_component.modules[0].device->smsg_irq_mhndl;
|
||||
|
||||
rc = GNI_EpPostDataWId (ep->smsg_ep_handle, &ep->mailbox->attr, sizeof (ep->mailbox->attr),
|
||||
&ep->remote_attr, sizeof (ep->remote_attr),
|
||||
@ -179,7 +190,7 @@ static inline int mca_btl_ugni_directed_ep_post (mca_btl_base_endpoint_t *ep) {
|
||||
int mca_btl_ugni_ep_connect_progress (mca_btl_base_endpoint_t *ep) {
|
||||
int rc;
|
||||
|
||||
BTL_VERBOSE(("progressing connection for endpoint %p with state %d", ep, ep->state));
|
||||
BTL_VERBOSE(("progressing connection for endpoint %p with state %d", (void *)ep, ep->state));
|
||||
|
||||
if (MCA_BTL_UGNI_EP_STATE_CONNECTED == ep->state) {
|
||||
return OPAL_SUCCESS;
|
||||
|
@ -39,9 +39,11 @@ typedef struct mca_btl_base_endpoint_t {
|
||||
gni_ep_handle_t smsg_ep_handle;
|
||||
gni_ep_handle_t rdma_ep_handle;
|
||||
|
||||
mca_btl_ugni_endpoint_attr_t remote_attr;
|
||||
mca_btl_ugni_endpoint_attr_t remote_attr; /* TODO: UGH, remove this */
|
||||
|
||||
struct mca_btl_ugni_smsg_mbox_t *mailbox;
|
||||
gni_mem_handle_t rmt_irq_mem_hndl;
|
||||
|
||||
|
||||
opal_list_t frag_wait_list;
|
||||
bool wait_listed;
|
||||
|
@ -74,12 +74,14 @@ mca_btl_ugni_module_init (mca_btl_ugni_module_t *ugni_module,
|
||||
ugni_module->active_send_count = 0;
|
||||
|
||||
OBJ_CONSTRUCT(&ugni_module->failed_frags, opal_list_t);
|
||||
OBJ_CONSTRUCT(&ugni_module->failed_frags_lock,opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&ugni_module->eager_frags_send, ompi_free_list_t);
|
||||
OBJ_CONSTRUCT(&ugni_module->eager_frags_recv, ompi_free_list_t);
|
||||
OBJ_CONSTRUCT(&ugni_module->smsg_frags, ompi_free_list_t);
|
||||
OBJ_CONSTRUCT(&ugni_module->rdma_frags, ompi_free_list_t);
|
||||
OBJ_CONSTRUCT(&ugni_module->rdma_int_frags, ompi_free_list_t);
|
||||
OBJ_CONSTRUCT(&ugni_module->pending_smsg_frags_bb, opal_pointer_array_t);
|
||||
OBJ_CONSTRUCT(&ugni_module->ep_wait_list_lock,opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&ugni_module->ep_wait_list, opal_list_t);
|
||||
OBJ_CONSTRUCT(&ugni_module->endpoints, opal_pointer_array_t);
|
||||
OBJ_CONSTRUCT(&ugni_module->id_to_endpoint, opal_hash_table_t);
|
||||
@ -90,8 +92,10 @@ mca_btl_ugni_module_init (mca_btl_ugni_module_t *ugni_module,
|
||||
|
||||
/* create wildcard endpoint to listen for connections.
|
||||
* there is no need to bind this endpoint. */
|
||||
OPAL_THREAD_LOCK(&dev->dev_lock);
|
||||
rc = GNI_EpCreate (ugni_module->device->dev_handle, NULL,
|
||||
&ugni_module->wildcard_ep);
|
||||
OPAL_THREAD_UNLOCK(&dev->dev_lock);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
BTL_ERROR(("error creating wildcard ugni endpoint"));
|
||||
return opal_common_rc_ugni_to_opal (rc);
|
||||
@ -136,6 +140,7 @@ mca_btl_ugni_module_finalize (struct mca_btl_base_module_t *btl)
|
||||
}
|
||||
|
||||
/* destroy all cqs */
|
||||
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
|
||||
rc = GNI_CqDestroy (ugni_module->rdma_local_cq);
|
||||
if (GNI_RC_SUCCESS != rc) {
|
||||
BTL_ERROR(("error tearing down local BTE/FMA CQ"));
|
||||
@ -164,6 +169,7 @@ mca_btl_ugni_module_finalize (struct mca_btl_base_module_t *btl)
|
||||
if (GNI_RC_SUCCESS != rc) {
|
||||
BTL_VERBOSE(("btl/ugni error destroying endpoint"));
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
|
||||
}
|
||||
|
||||
OBJ_DESTRUCT(&ugni_module->eager_frags_send);
|
||||
|
@ -36,7 +36,7 @@ static inline void init_gni_post_desc (mca_btl_ugni_base_frag_t *frag,
|
||||
frag->post_desc.base.remote_addr = (uint64_t) rem_addr;
|
||||
frag->post_desc.base.remote_mem_hndl = rem_mdh;
|
||||
frag->post_desc.base.length = bufsize;
|
||||
frag->post_desc.base.rdma_mode = 0;
|
||||
frag->post_desc.base.rdma_mode = GNI_RDMAMODE_FENCE;
|
||||
frag->post_desc.base.src_cq_hndl = cq_hndl;
|
||||
frag->post_desc.tries = 0;
|
||||
}
|
||||
@ -50,7 +50,9 @@ static inline int mca_btl_ugni_post_fma (mca_btl_ugni_base_frag_t *frag, gni_pos
|
||||
init_gni_post_desc (frag, op_type, lcl_seg->base.seg_addr.lval, lcl_seg->memory_handle,
|
||||
rem_seg->base.seg_addr.lval, rem_seg->memory_handle, lcl_seg->base.seg_len, 0);
|
||||
|
||||
OPAL_THREAD_LOCK(&frag->endpoint->common->dev->dev_lock);
|
||||
rc = GNI_PostFma (frag->endpoint->rdma_ep_handle, &frag->post_desc.base);
|
||||
OPAL_THREAD_UNLOCK(&frag->endpoint->common->dev->dev_lock);
|
||||
if (GNI_RC_SUCCESS != rc) {
|
||||
BTL_VERBOSE(("GNI_PostFma failed with gni rc: %d", rc));
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
@ -62,17 +64,19 @@ static inline int mca_btl_ugni_post_fma (mca_btl_ugni_base_frag_t *frag, gni_pos
|
||||
static inline int mca_btl_ugni_post_bte (mca_btl_ugni_base_frag_t *frag, gni_post_type_t op_type,
|
||||
mca_btl_ugni_segment_t *lcl_seg, mca_btl_ugni_segment_t *rem_seg)
|
||||
{
|
||||
gni_return_t rc;
|
||||
gni_return_t status;
|
||||
|
||||
/* Post descriptor */
|
||||
init_gni_post_desc (frag, op_type, lcl_seg->base.seg_addr.lval, lcl_seg->memory_handle,
|
||||
rem_seg->base.seg_addr.lval, rem_seg->memory_handle, lcl_seg->base.seg_len,
|
||||
frag->endpoint->btl->rdma_local_cq);
|
||||
|
||||
rc = GNI_PostRdma (frag->endpoint->rdma_ep_handle, &frag->post_desc.base);
|
||||
if (GNI_RC_SUCCESS != rc) {
|
||||
BTL_VERBOSE(("GNI_PostRdma failed with gni rc: %d", rc));
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
OPAL_THREAD_LOCK(&frag->endpoint->common->dev->dev_lock);
|
||||
status = GNI_PostRdma (frag->endpoint->rdma_ep_handle, &frag->post_desc.base);
|
||||
OPAL_THREAD_UNLOCK(&frag->endpoint->common->dev->dev_lock);
|
||||
if (GNI_RC_SUCCESS != status) {
|
||||
BTL_VERBOSE(("GNI_PostRdma failed with gni rc: %d", status));
|
||||
return opal_common_rc_ugni_to_opal(status);
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
@ -99,16 +103,20 @@ static inline void mca_btl_ugni_repost (mca_btl_ugni_base_frag_t *frag, int rc)
|
||||
|
||||
frag->cbfunc = mca_btl_ugni_frag_complete;
|
||||
|
||||
OPAL_THREAD_LOCK(&frag->endpoint->common->dev->dev_lock);
|
||||
if (GNI_POST_RDMA_PUT == frag->post_desc.base.type ||
|
||||
GNI_POST_RDMA_GET == frag->post_desc.base.type) {
|
||||
grc = GNI_PostRdma (frag->endpoint->rdma_ep_handle, &frag->post_desc.base);
|
||||
} else {
|
||||
grc = GNI_PostFma (frag->endpoint->rdma_ep_handle, &frag->post_desc.base);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&frag->endpoint->common->dev->dev_lock);
|
||||
|
||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc)) {
|
||||
frag->cbfunc = mca_btl_ugni_repost;
|
||||
OPAL_THREAD_LOCK(&frag->endpoint->btl->failed_frags_lock);
|
||||
opal_list_append (&frag->endpoint->btl->failed_frags, (opal_list_item_t *) frag);
|
||||
OPAL_THREAD_UNLOCK(&frag->endpoint->btl->failed_frags_lock);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -32,7 +32,9 @@ int mca_btl_ugni_send (struct mca_btl_base_module_t *btl,
|
||||
rc = mca_btl_ugni_check_endpoint_state (endpoint);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
|
||||
OPAL_THREAD_LOCK(&endpoint->lock);
|
||||
opal_list_append (&endpoint->frag_wait_list, (opal_list_item_t *) frag);
|
||||
OPAL_THREAD_UNLOCK(&endpoint->lock);
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
@ -71,11 +73,15 @@ int mca_btl_ugni_send (struct mca_btl_base_module_t *btl,
|
||||
if (OPAL_UNLIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) {
|
||||
/* queue up request */
|
||||
if (false == endpoint->wait_listed) {
|
||||
OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
|
||||
opal_list_append (&ugni_module->ep_wait_list, &endpoint->super);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
|
||||
endpoint->wait_listed = true;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&endpoint->lock);
|
||||
opal_list_append (&endpoint->frag_wait_list, (opal_list_item_t *) frag);
|
||||
OPAL_THREAD_UNLOCK(&endpoint->lock);
|
||||
rc = OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
@ -133,22 +139,28 @@ mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl,
|
||||
|
||||
int mca_btl_ugni_progress_send_wait_list (mca_btl_base_endpoint_t *endpoint)
|
||||
{
|
||||
mca_btl_ugni_base_frag_t *frag;
|
||||
mca_btl_ugni_base_frag_t *frag=NULL;
|
||||
int rc;
|
||||
|
||||
while (NULL !=
|
||||
(frag = (mca_btl_ugni_base_frag_t *) opal_list_remove_first (&endpoint->frag_wait_list))) {
|
||||
do {
|
||||
OPAL_THREAD_LOCK(&endpoint->lock);
|
||||
frag = (mca_btl_ugni_base_frag_t *) opal_list_remove_first (&endpoint->frag_wait_list);
|
||||
OPAL_THREAD_UNLOCK(&endpoint->lock);
|
||||
if (NULL == frag) {
|
||||
break;
|
||||
}
|
||||
rc = mca_btl_ugni_send_frag (endpoint, frag);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS > rc)) {
|
||||
if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) {
|
||||
OPAL_THREAD_LOCK(&endpoint->lock);
|
||||
opal_list_prepend (&endpoint->frag_wait_list, (opal_list_item_t *) frag);
|
||||
OPAL_THREAD_UNLOCK(&endpoint->lock);
|
||||
} else {
|
||||
mca_btl_ugni_frag_complete (frag, rc);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
} while(1);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
@ -27,8 +27,17 @@ static void mca_btl_ugni_smsg_mbox_construct (mca_btl_ugni_smsg_mbox_t *mbox) {
|
||||
mbox->attr.smsg_attr.msg_buffer = base_reg->base;
|
||||
mbox->attr.smsg_attr.buff_size = mca_btl_ugni_component.smsg_mbox_size;
|
||||
mbox->attr.smsg_attr.mem_hndl = ugni_reg->memory_hdl;
|
||||
#if 0
|
||||
fprintf(stderr,"ugni_reg->memory_hdl 0x%lx 0x%lx\n",
|
||||
ugni_reg->memory_hdl.qword1,ugni_reg->memory_hdl.qword2);
|
||||
#endif
|
||||
|
||||
mbox->attr.proc_id = mca_btl_ugni_proc_name_to_id (OPAL_PROC_MY_NAME);
|
||||
mbox->attr.rmt_irq_mem_hndl = mca_btl_ugni_component.modules[0].device->smsg_irq_mhndl;
|
||||
#if 0
|
||||
fprintf(stderr,"Invoked mca_btl_ugni_smsg_mbox_construct with mbox->attr.rmt_irq_mem_hndl = 0x%lx 0x%lx\n",
|
||||
mbox->attr.rmt_irq_mem_hndl.qword1,mbox->attr.rmt_irq_mem_hndl.qword2);
|
||||
#endif
|
||||
}
|
||||
|
||||
OBJ_CLASS_INSTANCE(mca_btl_ugni_smsg_mbox_t, ompi_free_list_item_t,
|
||||
@ -42,7 +51,7 @@ int mca_btl_ugni_smsg_init (mca_btl_ugni_module_t *ugni_module)
|
||||
rc = GNI_SmsgSetMaxRetrans (ugni_module->device->dev_handle,
|
||||
mca_btl_ugni_component.smsg_max_retries);
|
||||
if (GNI_RC_SUCCESS != rc) {
|
||||
BTL_ERROR(("error setting maximum SMSG retries"));
|
||||
BTL_ERROR(("error setting maximum SMSG retries %s",gni_err_str[rc]));
|
||||
return opal_common_rc_ugni_to_opal (rc);
|
||||
}
|
||||
|
||||
@ -70,9 +79,11 @@ int mca_btl_ugni_smsg_process (mca_btl_base_endpoint_t *ep)
|
||||
do {
|
||||
uint8_t tag = GNI_SMSG_ANY_TAG;
|
||||
|
||||
OPAL_THREAD_LOCK(&ep->common->dev->dev_lock);
|
||||
rc = GNI_SmsgGetNextWTag (ep->smsg_ep_handle, (void **) &data_ptr, &tag);
|
||||
OPAL_THREAD_UNLOCK(&ep->common->dev->dev_lock);
|
||||
if (GNI_RC_NOT_DONE == rc) {
|
||||
BTL_VERBOSE(("no smsg message waiting. rc = %d", rc));
|
||||
BTL_VERBOSE(("no smsg message waiting. rc = %s", gni_err_str[rc]));
|
||||
|
||||
ep->smsg_progressing = 0;
|
||||
|
||||
@ -80,7 +91,7 @@ int mca_btl_ugni_smsg_process (mca_btl_base_endpoint_t *ep)
|
||||
}
|
||||
|
||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) {
|
||||
fprintf (stderr, "Unhandled Smsg error: %d\n", rc);
|
||||
fprintf (stderr, "Unhandled Smsg error: %s\n", gni_err_str[rc]);
|
||||
assert (0);
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
@ -140,7 +151,9 @@ int mca_btl_ugni_smsg_process (mca_btl_base_endpoint_t *ep)
|
||||
break;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&ep->common->dev->dev_lock);
|
||||
rc = GNI_SmsgRelease (ep->smsg_ep_handle);
|
||||
OPAL_THREAD_UNLOCK(&ep->common->dev->dev_lock);
|
||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) {
|
||||
BTL_ERROR(("Smsg release failed! rc = %d", rc));
|
||||
return OPAL_ERROR;
|
||||
@ -175,7 +188,9 @@ mca_btl_ugni_handle_remote_smsg_overrun (mca_btl_ugni_module_t *btl)
|
||||
|
||||
/* clear out remote cq */
|
||||
do {
|
||||
OPAL_THREAD_LOCK(&btl->device->dev_lock);
|
||||
rc = GNI_CqGetEvent (btl->smsg_remote_cq, &event_data);
|
||||
OPAL_THREAD_UNLOCK(&btl->device->dev_lock);
|
||||
} while (GNI_RC_NOT_DONE != rc);
|
||||
|
||||
endpoint_count = opal_pointer_array_get_size (&btl->endpoints);
|
||||
@ -207,7 +222,9 @@ int mca_btl_ugni_progress_remote_smsg (mca_btl_ugni_module_t *btl)
|
||||
gni_return_t grc;
|
||||
uint64_t inst_id;
|
||||
|
||||
OPAL_THREAD_LOCK(&btl->device->dev_lock);
|
||||
grc = GNI_CqGetEvent (btl->smsg_remote_cq, &event_data);
|
||||
OPAL_THREAD_UNLOCK(&btl->device->dev_lock);
|
||||
if (GNI_RC_NOT_DONE == grc) {
|
||||
return 0;
|
||||
}
|
||||
|
@ -47,7 +47,9 @@ static inline int mca_btl_ugni_progress_local_smsg (mca_btl_ugni_module_t *ugni_
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
|
||||
grc = GNI_CqGetEvent (ugni_module->smsg_local_cq, &event_data);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
|
||||
if (GNI_RC_NOT_DONE == grc) {
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
@ -69,7 +71,7 @@ static inline int mca_btl_ugni_progress_local_smsg (mca_btl_ugni_module_t *ugni_
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
ugni_module->active_send_count--;
|
||||
opal_atomic_add_32(&ugni_module->active_send_count,-1);
|
||||
|
||||
frag->flags |= MCA_BTL_UGNI_FRAG_SMSG_COMPLETE;
|
||||
|
||||
@ -83,15 +85,18 @@ static inline int mca_btl_ugni_progress_local_smsg (mca_btl_ugni_module_t *ugni_
|
||||
static inline int opal_mca_btl_ugni_smsg_send (mca_btl_ugni_base_frag_t *frag,
|
||||
void *hdr, size_t hdr_len,
|
||||
void *payload, size_t payload_len,
|
||||
mca_btl_ugni_smsg_tag_t tag) {
|
||||
mca_btl_ugni_smsg_tag_t tag)
|
||||
{
|
||||
gni_return_t grc;
|
||||
|
||||
OPAL_THREAD_LOCK(&frag->endpoint->common->dev->dev_lock);
|
||||
grc = GNI_SmsgSendWTag (frag->endpoint->smsg_ep_handle, hdr, hdr_len,
|
||||
payload, payload_len, frag->msg_id, tag);
|
||||
OPAL_THREAD_UNLOCK(&frag->endpoint->common->dev->dev_lock);
|
||||
|
||||
if (OPAL_LIKELY(GNI_RC_SUCCESS == grc)) {
|
||||
/* increment the active send counter */
|
||||
frag->endpoint->btl->active_send_count++;
|
||||
opal_atomic_add_32(&frag->endpoint->btl->active_send_count,1);
|
||||
|
||||
(void) mca_btl_ugni_progress_local_smsg ((mca_btl_ugni_module_t *) frag->endpoint->btl);
|
||||
return OPAL_SUCCESS;
|
||||
|
@ -136,6 +136,8 @@ static int opal_common_ugni_device_init (opal_common_ugni_device_t *device,
|
||||
|
||||
OPAL_OUTPUT((-1, "Got NIC Addr: 0x%08x, CPU ID: %d", device->dev_addr, device->dev_id));
|
||||
|
||||
OBJ_CONSTRUCT(&device->dev_lock,opal_mutex_t);
|
||||
|
||||
/* Attach device to the communication domain */
|
||||
rc = GNI_CdmAttach (opal_common_ugni_module.cd_handle, device->dev_id,
|
||||
&device->dev_pe_addr, &device->dev_handle);
|
||||
@ -267,8 +269,9 @@ int opal_common_ugni_init (void)
|
||||
mca_btl_ugni_component.rdma_max_retries;
|
||||
|
||||
/* Create a communication domain */
|
||||
/* TODO - bte single should be removed when the IRQ problem is figured out */
|
||||
modes = GNI_CDM_MODE_FORK_FULLCOPY | GNI_CDM_MODE_CACHED_AMO_ENABLED |
|
||||
GNI_CDM_MODE_ERR_NO_KILL | GNI_CDM_MODE_FAST_DATAGRAM_POLL;
|
||||
GNI_CDM_MODE_ERR_NO_KILL | GNI_CDM_MODE_FAST_DATAGRAM_POLL | GNI_CDM_MODE_BTE_SINGLE_CHANNEL;
|
||||
|
||||
/* collect uGNI information */
|
||||
rc = get_ptag(&opal_common_ugni_module.ptag);
|
||||
|
@ -33,6 +33,7 @@
|
||||
struct opal_common_ugni_modex_t {
|
||||
uint32_t addr;
|
||||
int id;
|
||||
gni_mem_handle_t irq_memhndl;
|
||||
};
|
||||
typedef struct opal_common_ugni_modex_t opal_common_ugni_modex_t;
|
||||
|
||||
@ -47,8 +48,10 @@ struct opal_common_ugni_device_t {
|
||||
uint32_t dev_addr;
|
||||
uint32_t dev_cpu_id;
|
||||
|
||||
size_t dev_ep_count;
|
||||
void *btl_ctx;
|
||||
size_t dev_ep_count;
|
||||
opal_mutex_t dev_lock;
|
||||
gni_mem_handle_t smsg_irq_mhndl;
|
||||
void *btl_ctx;
|
||||
};
|
||||
typedef struct opal_common_ugni_device_t opal_common_ugni_device_t;
|
||||
|
||||
|
@ -42,6 +42,7 @@ int opal_common_ugni_endpoint_for_proc (opal_common_ugni_device_t *dev, opal_pro
|
||||
|
||||
endpoint->ep_rem_addr = modex->addr;
|
||||
endpoint->ep_rem_id = modex->id;
|
||||
endpoint->ep_rem_irq_memhndl = modex->irq_memhndl;
|
||||
|
||||
endpoint->dev = dev;
|
||||
|
||||
@ -70,14 +71,21 @@ int opal_common_ugni_ep_create (opal_common_ugni_endpoint_t *cep, gni_cq_handle_
|
||||
}
|
||||
|
||||
/* create a uGNI endpoint handle and bind it to the remote peer */
|
||||
OPAL_THREAD_LOCK(&cep->dev->dev_lock);
|
||||
grc = GNI_EpCreate (cep->dev->dev_handle, cq, ep_handle);
|
||||
OPAL_THREAD_UNLOCK(&cep->dev->dev_lock);
|
||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc)) {
|
||||
return opal_common_rc_ugni_to_opal (grc);
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&cep->dev->dev_lock);
|
||||
grc = GNI_EpBind (*ep_handle, cep->ep_rem_addr, cep->ep_rem_id);
|
||||
OPAL_THREAD_UNLOCK(&cep->dev->dev_lock);
|
||||
|
||||
if (GNI_RC_SUCCESS != grc) {
|
||||
OPAL_THREAD_LOCK(&cep->dev->dev_lock);
|
||||
GNI_EpDestroy (*ep_handle);
|
||||
OPAL_THREAD_UNLOCK(&cep->dev->dev_lock);
|
||||
return opal_common_rc_ugni_to_opal (grc);
|
||||
}
|
||||
|
||||
@ -92,6 +100,7 @@ int opal_common_ugni_ep_destroy (gni_ep_handle_t *ep)
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/* TODO: need to fix, may be outstanding tx's, etc. */
|
||||
rc = GNI_EpUnbind (*ep);
|
||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) {
|
||||
/* should warn */
|
||||
|
@ -18,6 +18,7 @@ struct opal_common_ugni_device_t;
|
||||
struct opal_common_ugni_endpoint_t {
|
||||
opal_object_t super;
|
||||
uint32_t ep_rem_addr, ep_rem_id; /**< remote information */
|
||||
gni_mem_handle_t ep_rem_irq_memhndl;
|
||||
struct opal_common_ugni_device_t *dev; /**< device this endpoint is using */
|
||||
};
|
||||
typedef struct opal_common_ugni_endpoint_t opal_common_ugni_endpoint_t;
|
||||
|
@ -96,6 +96,7 @@ struct mca_mpool_udreg_module_t {
|
||||
struct mca_mpool_base_resources_t resources;
|
||||
ompi_free_list_t reg_list;
|
||||
mca_mpool_udreg_hugepage_t *huge_page;
|
||||
opal_mutex_t lock;
|
||||
void *udreg_handle;
|
||||
};
|
||||
typedef struct mca_mpool_udreg_module_t mca_mpool_udreg_module_t;
|
||||
|
@ -156,6 +156,8 @@ int mca_mpool_udreg_module_init(mca_mpool_udreg_module_t* mpool)
|
||||
cache_attr.modes |= UDREG_CC_MODE_USE_LAZY_DEREG;
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&mpool->lock,opal_mutex_t);
|
||||
|
||||
strncpy (cache_attr.cache_name, mpool->resources.pool_name, UDREG_MAX_CACHENAME_LEN);
|
||||
cache_attr.max_entries = mpool->resources.max_entries;
|
||||
cache_attr.debug_mode = 0;
|
||||
@ -363,14 +365,17 @@ int mca_mpool_udreg_register(mca_mpool_base_module_t *mpool, void *addr,
|
||||
|
||||
if (false == bypass_cache) {
|
||||
/* Get a udreg entry for this region */
|
||||
OPAL_THREAD_LOCK(&mpool_udreg->lock);
|
||||
while (UDREG_RC_SUCCESS !=
|
||||
(urc = UDREG_Register (mpool_udreg->udreg_handle, addr, size, &udreg_entry))) {
|
||||
/* try to remove one unused reg and retry */
|
||||
if (!mca_mpool_udreg_evict (mpool)) {
|
||||
*reg = NULL;
|
||||
OPAL_THREAD_UNLOCK(&mpool_udreg->lock);
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&mpool_udreg->lock);
|
||||
|
||||
udreg_reg = (mca_mpool_base_registration_t *) udreg_entry->device_data;
|
||||
udreg_reg->mpool_context = udreg_entry;
|
||||
@ -444,7 +449,9 @@ int mca_mpool_udreg_deregister(struct mca_mpool_base_module_t *mpool,
|
||||
if (0 == reg->ref_count && reg->flags & MCA_MPOOL_FLAGS_CACHE_BYPASS) {
|
||||
mca_mpool_udreg_dereg_func (reg, mpool);
|
||||
} else if (!(reg->flags & MCA_MPOOL_FLAGS_CACHE_BYPASS)) {
|
||||
OPAL_THREAD_LOCK(&mpool_udreg->lock);
|
||||
UDREG_DecrRefcount (mpool_udreg->udreg_handle, reg->mpool_context);
|
||||
OPAL_THREAD_UNLOCK(&mpool_udreg->lock);
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
@ -473,6 +480,7 @@ void mca_mpool_udreg_finalize(struct mca_mpool_base_module_t *mpool)
|
||||
|
||||
UDREG_CacheRelease (mpool_udreg->udreg_handle);
|
||||
OBJ_DESTRUCT(&mpool_udreg->reg_list);
|
||||
OBJ_DESTRUCT(&mpool_udreg->lock);
|
||||
}
|
||||
|
||||
int mca_mpool_udreg_ft_event(int state) {
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user