btl/ugni: report failures through BTL_ERROR and return them to callers

Replaces the fprintf(stderr, ...) diagnostics in the uGNI progress-thread
and SMSG code with BTL_ERROR, removes the disabled (#if 0) debug prints,
and makes the progress-thread spawn/kill helpers return an error code
instead of unconditionally returning OPAL_SUCCESS.

Review fixes folded into this revision of the patch:
  * the progress thread entry point returns void *, so its int status is
    converted explicitly instead of being returned as a bare int;
  * the return value of pthread_cond_wait is now actually stored in rc
    before it is tested;
  * the pthread attribute object is destroyed on the early-exit paths of
    mca_btl_ugni_spawn_progress_thread;
  * "pthread_attr_destory" typo in the log message.

NOTE(review): indentation and the position of blank context lines were
reconstructed from a whitespace-collapsed copy of this patch; confirm
against the tree before applying. The index hashes are those of the
original patch.

diff --git a/opal/mca/btl/ugni/btl_ugni_progress_thread.c b/opal/mca/btl/ugni/btl_ugni_progress_thread.c
index d53d532118..e2d35cc4cc 100644
--- a/opal/mca/btl/ugni/btl_ugni_progress_thread.c
+++ b/opal/mca/btl/ugni/btl_ugni_progress_thread.c
@@ -30,7 +30,7 @@ static int thread_wakeups = 0;
 
 static void *mca_btl_ugni_prog_thread_fn(void * data)
 {
-    int rc;
+    int rc,ret = OPAL_SUCCESS;
     uint32_t which;
     gni_return_t status;
     gni_cq_handle_t cq_vec[2];
@@ -66,45 +66,67 @@ static void *mca_btl_ugni_prog_thread_fn(void * data)
 
     /* Send a signal to the main thread saying we are done */
     rc = pthread_mutex_lock(&progress_mutex);
-    if (rc != 0) {
-        fprintf(stderr,"Hey pthread_mutex_lock failed\n");
+    if (0 != rc) {
+        BTL_ERROR(("btl/ugni pthread_mutex_lock returned %s ",strerror(rc)));
+        ret = OPAL_ERROR;
+        goto fn_exit;
     }
 
     progress_thread_done = 1;
 
     rc = pthread_mutex_unlock(&progress_mutex);
-    if (rc != 0) {
-        fprintf(stderr,"Hey pthread_mutex_unlock failed\n");
+    if (0 != rc) {
+        BTL_ERROR(("btl/ugni pthread_mutex_unlock returned %s ",strerror(rc)));
+        ret = OPAL_ERROR;
+        goto fn_exit;
     }
 
-    rc = pthread_cond_signal(&progress_cond);
 
-    return OPAL_SUCCESS;
+    rc = pthread_cond_signal(&progress_cond);
+    if (0 != rc) {
+        BTL_ERROR(("btl/ugni pthread_cond_signal returned %s ",strerror(rc)));
+        ret = OPAL_ERROR;
+    }
+
+ fn_exit:
+    return (void *)(intptr_t) ret;
 }
 
 int mca_btl_ugni_spawn_progress_thread(struct mca_btl_base_module_t *btl)
 {
-    int rc;
+    int rc, ret=OPAL_SUCCESS;
     pthread_attr_t attr;
 
     pthread_attr_init(&attr);
-    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+    rc = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+    if (0 != rc) {
+        BTL_ERROR(("btl/ugni pthread_attr_setdetachstate returned %s ",strerror(rc)));
+        (void) pthread_attr_destroy(&attr);
+        ret = OPAL_ERROR;
+        goto fn_exit;
+    }
     rc = pthread_create(&mca_btl_ugni_progress_thread_id,
                         &attr, mca_btl_ugni_prog_thread_fn, (void *)btl);
-    if (rc != 0) {
-        fprintf(stderr,"Hey, pthread_create returned with error %d (%s) \n",errno,strerror(errno));
+    if (0 != rc) {
+        BTL_ERROR(("btl/ugni pthread_create returned %s ",strerror(rc)));
+        (void) pthread_attr_destroy(&attr);
+        ret = OPAL_ERROR;
+        goto fn_exit;
     }
 
     rc = pthread_attr_destroy(&attr);
-    if (rc != 0) {
-        fprintf(stderr,"Hey, pthread_attr_destroy returned with error %d (%s) \n",errno,strerror(errno));
+    if (0 != rc) {
+        BTL_ERROR(("btl/ugni pthread_attr_destroy returned %s ",strerror(rc)));
+        ret = OPAL_ERROR;
     }
 
-    return OPAL_SUCCESS;
+ fn_exit:
+    return ret;
 }
 
 int mca_btl_ugni_kill_progress_thread(void)
 {
+    int rc, ret=OPAL_SUCCESS;
     gni_return_t status;
     static mca_btl_ugni_base_frag_t cq_write_frag;
 
@@ -131,15 +153,32 @@ int mca_btl_ugni_kill_progress_thread(void)
      */
     if (GNI_RC_SUCCESS != status) {
         BTL_ERROR(("GNI_PostCqWrite returned error - %s",gni_err_str[status]));
+        ret = opal_common_rc_ugni_to_opal(status);
+        goto fn_exit;
     }
 
-    pthread_mutex_lock(&progress_mutex);
+    rc = pthread_mutex_lock(&progress_mutex);
+    if (0 != rc) {
+        BTL_ERROR(("btl/ugni pthread_mutex_lock returned %s ",strerror(rc)));
+        ret = OPAL_ERROR;
+        goto fn_exit;
+    }
 
     while (!progress_thread_done) {
-        pthread_cond_wait(&progress_cond, &progress_mutex);
+        rc = pthread_cond_wait(&progress_cond, &progress_mutex);
+        if (0 != rc) {
+            BTL_ERROR(("btl/ugni pthread_cond_wait returned %s ",strerror(rc)));
+            ret = OPAL_ERROR;
+            goto fn_exit;
+        }
     }
 
-    pthread_mutex_unlock(&progress_mutex);
+    rc = pthread_mutex_unlock(&progress_mutex);
+    if (0 != rc) {
+        BTL_ERROR(("btl/ugni pthread_mutex_unlock returned %s ",strerror(rc)));
+        ret = OPAL_ERROR;
+        goto fn_exit;
+    }
 
     /*
      * destroy the local_ep
@@ -149,10 +188,12 @@ int mca_btl_ugni_kill_progress_thread(void)
     status = GNI_EpDestroy (mca_btl_ugni_component.modules[0].local_ep);
     OPAL_THREAD_UNLOCK(&mca_btl_ugni_component.modules[0].device->dev_lock);
     if (OPAL_UNLIKELY(GNI_RC_SUCCESS != status)) {
-        BTL_ERROR(("error destroy local ep endpoint - %s", gni_err_str[status]));
-        return opal_common_rc_ugni_to_opal(status);
+        BTL_ERROR(("GNI_EpDestroy returned error - %s", gni_err_str[status]));
+        ret = opal_common_rc_ugni_to_opal(status);
+        goto fn_exit;
     }
 
-    return OPAL_SUCCESS;
+ fn_exit:
+    return ret;
 }
 
diff --git a/opal/mca/btl/ugni/btl_ugni_smsg.c b/opal/mca/btl/ugni/btl_ugni_smsg.c
index b8b6cf84fb..c4bf94f79e 100644
--- a/opal/mca/btl/ugni/btl_ugni_smsg.c
+++ b/opal/mca/btl/ugni/btl_ugni_smsg.c
@@ -27,17 +27,8 @@ static void mca_btl_ugni_smsg_mbox_construct (mca_btl_ugni_smsg_mbox_t *mbox) {
     mbox->attr.smsg_attr.msg_buffer = base_reg->base;
     mbox->attr.smsg_attr.buff_size = mca_btl_ugni_component.smsg_mbox_size;
     mbox->attr.smsg_attr.mem_hndl = ugni_reg->memory_hdl;
-#if 0
-    fprintf(stderr,"ugni_reg->memory_hdl 0x%lx 0x%lx\n",
-            ugni_reg->memory_hdl.qword1,ugni_reg->memory_hdl.qword2);
-#endif
-
     mbox->attr.proc_id = mca_btl_ugni_proc_name_to_id (OPAL_PROC_MY_NAME);
     mbox->attr.rmt_irq_mem_hndl = mca_btl_ugni_component.modules[0].device->smsg_irq_mhndl;
-#if 0
-    fprintf(stderr,"Invoked mca_btl_ugni_smsg_mbox_construct with mbox->attr.rmt_irq_mem_hndl = 0x%lx 0x%lx\n",
-            mbox->attr.rmt_irq_mem_hndl.qword1,mbox->attr.rmt_irq_mem_hndl.qword2);
-#endif
 }
 
 OBJ_CLASS_INSTANCE(mca_btl_ugni_smsg_mbox_t, ompi_free_list_item_t,
@@ -91,8 +82,7 @@ int mca_btl_ugni_smsg_process (mca_btl_base_endpoint_t *ep)
         }
 
         if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) {
-            fprintf (stderr, "Unhandled Smsg error: %s\n", gni_err_str[rc]);
-            assert (0);
+            BTL_ERROR(("GNI_SmsgGetNextWTag returned error %s", gni_err_str[rc]));
             return OPAL_ERROR;
         }
 
diff --git a/opal/mca/btl/ugni/btl_ugni_smsg.h b/opal/mca/btl/ugni/btl_ugni_smsg.h
index 1c36638dc5..4298a5083b 100644
--- a/opal/mca/btl/ugni/btl_ugni_smsg.h
+++ b/opal/mca/btl/ugni/btl_ugni_smsg.h
@@ -125,15 +125,10 @@ static inline int opal_mca_btl_ugni_smsg_send (mca_btl_ugni_base_frag_t *frag,
             cq_write_frag->post_desc.base.remote_mem_hndl = frag->endpoint->rmt_irq_mem_hndl;
             cq_write_frag->post_desc.tries = 0;
             cq_write_frag->cbfunc = mca_btl_ugni_cqwrite_complete;
-#if 0
-            fprintf(stderr,"doing a GNI_PostCqWrite to 0x%lx 0x%lx \n",cq_write_frag->post_desc.base.remote_mem_hndl.qword1,
-                    cq_write_frag->post_desc.base.remote_mem_hndl.qword2);
-#endif
             OPAL_THREAD_LOCK(&frag->endpoint->common->dev->dev_lock);
             grc = GNI_PostCqWrite(frag->endpoint->rdma_ep_handle, &cq_write_frag->post_desc.base);
             OPAL_THREAD_UNLOCK(&frag->endpoint->common->dev->dev_lock);
             if (grc == GNI_RC_ERROR_RESOURCE) { /* errors for PostCqWrite treated as non-fatal */
-                fprintf(stderr,"GNI_PostCqWrite returned gni error %s\n",gni_err_str[grc]);
                 mca_btl_ugni_frag_return (cq_write_frag);
             }
         }