btl/ugni: improve error handling
Improve error handling when pthread functions return errors. Remove stale debug code.
Этот коммит содержится в:
родитель
f8e354ce00
Коммит
065c756860
@ -30,7 +30,7 @@ static int thread_wakeups = 0;
|
|||||||
|
|
||||||
static void *mca_btl_ugni_prog_thread_fn(void * data)
|
static void *mca_btl_ugni_prog_thread_fn(void * data)
|
||||||
{
|
{
|
||||||
int rc;
|
int rc,ret = OPAL_SUCCESS;
|
||||||
uint32_t which;
|
uint32_t which;
|
||||||
gni_return_t status;
|
gni_return_t status;
|
||||||
gni_cq_handle_t cq_vec[2];
|
gni_cq_handle_t cq_vec[2];
|
||||||
@ -66,45 +66,65 @@ static void *mca_btl_ugni_prog_thread_fn(void * data)
|
|||||||
|
|
||||||
/* Send a signal to the main thread saying we are done */
|
/* Send a signal to the main thread saying we are done */
|
||||||
rc = pthread_mutex_lock(&progress_mutex);
|
rc = pthread_mutex_lock(&progress_mutex);
|
||||||
if (rc != 0) {
|
if (0 != rc) {
|
||||||
fprintf(stderr,"Hey pthread_mutex_lock failed\n");
|
BTL_ERROR(("btl/ugni pthread_mutex_lock returned %s ",strerror(rc)));
|
||||||
|
ret = OPAL_ERROR;
|
||||||
|
goto fn_exit;
|
||||||
}
|
}
|
||||||
|
|
||||||
progress_thread_done = 1;
|
progress_thread_done = 1;
|
||||||
|
|
||||||
rc = pthread_mutex_unlock(&progress_mutex);
|
rc = pthread_mutex_unlock(&progress_mutex);
|
||||||
if (rc != 0) {
|
if (0 != rc) {
|
||||||
fprintf(stderr,"Hey pthread_mutex_unlock failed\n");
|
BTL_ERROR(("btl/ugni pthread_mutex_unlock returned %s ",strerror(rc)));
|
||||||
|
ret = OPAL_ERROR;
|
||||||
|
goto fn_exit;
|
||||||
}
|
}
|
||||||
rc = pthread_cond_signal(&progress_cond);
|
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
rc = pthread_cond_signal(&progress_cond);
|
||||||
|
if (0 != rc) {
|
||||||
|
BTL_ERROR(("btl/ugni pthread_cond_signal returned %s ",strerror(rc)));
|
||||||
|
ret = OPAL_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
fn_exit:
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
int mca_btl_ugni_spawn_progress_thread(struct mca_btl_base_module_t *btl)
|
int mca_btl_ugni_spawn_progress_thread(struct mca_btl_base_module_t *btl)
|
||||||
{
|
{
|
||||||
int rc;
|
int rc, ret=OPAL_SUCCESS;
|
||||||
pthread_attr_t attr;
|
pthread_attr_t attr;
|
||||||
|
|
||||||
pthread_attr_init(&attr);
|
pthread_attr_init(&attr);
|
||||||
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
|
rc = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
|
||||||
|
if (0 != rc) {
|
||||||
|
BTL_ERROR(("btl/ugni pthread_attr_setdetachstate returned %s ",strerror(rc)));
|
||||||
|
ret = OPAL_ERROR;
|
||||||
|
goto fn_exit;
|
||||||
|
}
|
||||||
|
|
||||||
rc = pthread_create(&mca_btl_ugni_progress_thread_id,
|
rc = pthread_create(&mca_btl_ugni_progress_thread_id,
|
||||||
&attr, mca_btl_ugni_prog_thread_fn, (void *)btl);
|
&attr, mca_btl_ugni_prog_thread_fn, (void *)btl);
|
||||||
if (rc != 0) {
|
if (0 != rc) {
|
||||||
fprintf(stderr,"Hey, pthread_create returned with error %d (%s) \n",errno,strerror(errno));
|
BTL_ERROR(("btl/ugni pthread_create returned %s ",strerror(rc)));
|
||||||
|
ret = OPAL_ERROR;
|
||||||
|
goto fn_exit;
|
||||||
}
|
}
|
||||||
|
|
||||||
rc = pthread_attr_destroy(&attr);
|
rc = pthread_attr_destroy(&attr);
|
||||||
if (rc != 0) {
|
if (0 != rc) {
|
||||||
fprintf(stderr,"Hey, pthread_attr_destroy returned with error %d (%s) \n",errno,strerror(errno));
|
BTL_ERROR(("btl/ugni pthread_attr_destory returned %s ",strerror(rc)));
|
||||||
|
ret = OPAL_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
fn_exit:
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
int mca_btl_ugni_kill_progress_thread(void)
|
int mca_btl_ugni_kill_progress_thread(void)
|
||||||
{
|
{
|
||||||
|
int rc, ret=OPAL_SUCCESS;
|
||||||
gni_return_t status;
|
gni_return_t status;
|
||||||
static mca_btl_ugni_base_frag_t cq_write_frag;
|
static mca_btl_ugni_base_frag_t cq_write_frag;
|
||||||
|
|
||||||
@ -131,15 +151,32 @@ int mca_btl_ugni_kill_progress_thread(void)
|
|||||||
*/
|
*/
|
||||||
if (GNI_RC_SUCCESS != status) {
|
if (GNI_RC_SUCCESS != status) {
|
||||||
BTL_ERROR(("GNI_PostCqWrite returned error - %s",gni_err_str[status]));
|
BTL_ERROR(("GNI_PostCqWrite returned error - %s",gni_err_str[status]));
|
||||||
|
ret = opal_common_rc_ugni_to_opal(status);
|
||||||
|
goto fn_exit;
|
||||||
}
|
}
|
||||||
|
|
||||||
pthread_mutex_lock(&progress_mutex);
|
rc = pthread_mutex_lock(&progress_mutex);
|
||||||
|
if (0 != rc) {
|
||||||
|
BTL_ERROR(("btl/ugni pthread_mutex_lock returned %s ",strerror(rc)));
|
||||||
|
ret = OPAL_ERROR;
|
||||||
|
goto fn_exit;
|
||||||
|
}
|
||||||
|
|
||||||
while (!progress_thread_done) {
|
while (!progress_thread_done) {
|
||||||
pthread_cond_wait(&progress_cond, &progress_mutex);
|
pthread_cond_wait(&progress_cond, &progress_mutex);
|
||||||
|
if (0 != rc) {
|
||||||
|
BTL_ERROR(("btl/ugni pthread_cond_wait returned %s ",strerror(rc)));
|
||||||
|
ret = OPAL_ERROR;
|
||||||
|
goto fn_exit;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pthread_mutex_unlock(&progress_mutex);
|
rc = pthread_mutex_unlock(&progress_mutex);
|
||||||
|
if (0 != rc) {
|
||||||
|
BTL_ERROR(("btl/ugni pthread_mutex_unlock returned %s ",strerror(rc)));
|
||||||
|
ret = OPAL_ERROR;
|
||||||
|
goto fn_exit;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* destroy the local_ep
|
* destroy the local_ep
|
||||||
@ -149,10 +186,12 @@ int mca_btl_ugni_kill_progress_thread(void)
|
|||||||
status = GNI_EpDestroy (mca_btl_ugni_component.modules[0].local_ep);
|
status = GNI_EpDestroy (mca_btl_ugni_component.modules[0].local_ep);
|
||||||
OPAL_THREAD_UNLOCK(&mca_btl_ugni_component.modules[0].device->dev_lock);
|
OPAL_THREAD_UNLOCK(&mca_btl_ugni_component.modules[0].device->dev_lock);
|
||||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != status)) {
|
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != status)) {
|
||||||
BTL_ERROR(("error destroy local ep endpoint - %s", gni_err_str[status]));
|
BTL_ERROR(("GNI_EpDestroy returned error - %s", gni_err_str[status]));
|
||||||
return opal_common_rc_ugni_to_opal(status);
|
ret = opal_common_rc_ugni_to_opal(status);
|
||||||
|
goto fn_exit;
|
||||||
}
|
}
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
fn_exit:
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -27,17 +27,8 @@ static void mca_btl_ugni_smsg_mbox_construct (mca_btl_ugni_smsg_mbox_t *mbox) {
|
|||||||
mbox->attr.smsg_attr.msg_buffer = base_reg->base;
|
mbox->attr.smsg_attr.msg_buffer = base_reg->base;
|
||||||
mbox->attr.smsg_attr.buff_size = mca_btl_ugni_component.smsg_mbox_size;
|
mbox->attr.smsg_attr.buff_size = mca_btl_ugni_component.smsg_mbox_size;
|
||||||
mbox->attr.smsg_attr.mem_hndl = ugni_reg->memory_hdl;
|
mbox->attr.smsg_attr.mem_hndl = ugni_reg->memory_hdl;
|
||||||
#if 0
|
|
||||||
fprintf(stderr,"ugni_reg->memory_hdl 0x%lx 0x%lx\n",
|
|
||||||
ugni_reg->memory_hdl.qword1,ugni_reg->memory_hdl.qword2);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
mbox->attr.proc_id = mca_btl_ugni_proc_name_to_id (OPAL_PROC_MY_NAME);
|
mbox->attr.proc_id = mca_btl_ugni_proc_name_to_id (OPAL_PROC_MY_NAME);
|
||||||
mbox->attr.rmt_irq_mem_hndl = mca_btl_ugni_component.modules[0].device->smsg_irq_mhndl;
|
mbox->attr.rmt_irq_mem_hndl = mca_btl_ugni_component.modules[0].device->smsg_irq_mhndl;
|
||||||
#if 0
|
|
||||||
fprintf(stderr,"Invoked mca_btl_ugni_smsg_mbox_construct with mbox->attr.rmt_irq_mem_hndl = 0x%lx 0x%lx\n",
|
|
||||||
mbox->attr.rmt_irq_mem_hndl.qword1,mbox->attr.rmt_irq_mem_hndl.qword2);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
OBJ_CLASS_INSTANCE(mca_btl_ugni_smsg_mbox_t, ompi_free_list_item_t,
|
OBJ_CLASS_INSTANCE(mca_btl_ugni_smsg_mbox_t, ompi_free_list_item_t,
|
||||||
@ -91,8 +82,7 @@ int mca_btl_ugni_smsg_process (mca_btl_base_endpoint_t *ep)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) {
|
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) {
|
||||||
fprintf (stderr, "Unhandled Smsg error: %s\n", gni_err_str[rc]);
|
BTL_ERROR(("GNI_SmsgGetNextWTag returned error %s", gni_err_str[rc]));
|
||||||
assert (0);
|
|
||||||
return OPAL_ERROR;
|
return OPAL_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -125,15 +125,10 @@ static inline int opal_mca_btl_ugni_smsg_send (mca_btl_ugni_base_frag_t *frag,
|
|||||||
cq_write_frag->post_desc.base.remote_mem_hndl = frag->endpoint->rmt_irq_mem_hndl;
|
cq_write_frag->post_desc.base.remote_mem_hndl = frag->endpoint->rmt_irq_mem_hndl;
|
||||||
cq_write_frag->post_desc.tries = 0;
|
cq_write_frag->post_desc.tries = 0;
|
||||||
cq_write_frag->cbfunc = mca_btl_ugni_cqwrite_complete;
|
cq_write_frag->cbfunc = mca_btl_ugni_cqwrite_complete;
|
||||||
#if 0
|
|
||||||
fprintf(stderr,"doing a GNI_PostCqWrite to 0x%lx 0x%lx \n",cq_write_frag->post_desc.base.remote_mem_hndl.qword1,
|
|
||||||
cq_write_frag->post_desc.base.remote_mem_hndl.qword2);
|
|
||||||
#endif
|
|
||||||
OPAL_THREAD_LOCK(&frag->endpoint->common->dev->dev_lock);
|
OPAL_THREAD_LOCK(&frag->endpoint->common->dev->dev_lock);
|
||||||
grc = GNI_PostCqWrite(frag->endpoint->rdma_ep_handle, &cq_write_frag->post_desc.base);
|
grc = GNI_PostCqWrite(frag->endpoint->rdma_ep_handle, &cq_write_frag->post_desc.base);
|
||||||
OPAL_THREAD_UNLOCK(&frag->endpoint->common->dev->dev_lock);
|
OPAL_THREAD_UNLOCK(&frag->endpoint->common->dev->dev_lock);
|
||||||
if (grc == GNI_RC_ERROR_RESOURCE) { /* errors for PostCqWrite treated as non-fatal */
|
if (grc == GNI_RC_ERROR_RESOURCE) { /* errors for PostCqWrite treated as non-fatal */
|
||||||
fprintf(stderr,"GNI_PostCqWrite returned gni error %s\n",gni_err_str[grc]);
|
|
||||||
mca_btl_ugni_frag_return (cq_write_frag);
|
mca_btl_ugni_frag_return (cq_write_frag);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user