diff --git a/ompi/mca/btl/openib/btl_openib.h b/ompi/mca/btl/openib/btl_openib.h index b755080563..8984a98d50 100644 --- a/ompi/mca/btl/openib/btl_openib.h +++ b/ompi/mca/btl/openib/btl_openib.h @@ -15,6 +15,7 @@ * Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. + * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -223,7 +224,7 @@ struct mca_btl_openib_component_t { int32_t apm_ports; uint32_t buffer_alignment; /**< Preferred communication buffer alignment in Bytes (must be power of two) */ #if OPAL_HAVE_THREADS - int32_t fatal_counter; /**< Counts number on fatal events that we got on all devices */ + int32_t error_counter; /**< Counts the number of error events that we got on all devices */ int async_pipe[2]; /**< Pipe for comunication with async event thread */ int async_comp_pipe[2]; /**< Pipe for async thread comunication with main thread */ pthread_t async_thread; /**< Async thread that will handle fatal errors */ @@ -356,6 +357,7 @@ typedef struct mca_btl_openib_device_t { bool pollme; #if OPAL_HAVE_THREADS volatile bool got_fatal_event; + volatile bool got_port_event; #endif #if HAVE_XRC struct ibv_xrc_domain *xrc_domain; diff --git a/ompi/mca/btl/openib/btl_openib_async.c b/ompi/mca/btl/openib/btl_openib_async.c index a5ab6d5c6b..b9a302128e 100644 --- a/ompi/mca/btl/openib/btl_openib_async.c +++ b/ompi/mca/btl/openib/btl_openib_async.c @@ -2,6 +2,7 @@ * Copyright (c) 2008-2009 Mellanox Technologies. All rights reserved. * Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. + * Copyright (c) 2009-2010 Oracle and/or its affiliates. 
All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -340,18 +341,26 @@ static int btl_openib_async_deviceh(struct mca_btl_openib_async_poll *devices_po /* Set the flag to fatal */ device->got_fatal_event = true; /* It is not critical to protect the counter */ - OPAL_THREAD_ADD32(&mca_btl_openib_component.fatal_counter, 1); + OPAL_THREAD_ADD32(&mca_btl_openib_component.error_counter, 1); case IBV_EVENT_CQ_ERR: case IBV_EVENT_QP_FATAL: case IBV_EVENT_QP_REQ_ERR: case IBV_EVENT_QP_ACCESS_ERR: case IBV_EVENT_PATH_MIG_ERR: case IBV_EVENT_SRQ_ERR: + orte_show_help("help-mpi-btl-openib.txt", "of error event", + true,orte_process_info.nodename, orte_process_info.pid, + event.event_type, openib_event_to_str(event.event_type), + xrc_event ? "true" : "false"); + break; case IBV_EVENT_PORT_ERR: orte_show_help("help-mpi-btl-openib.txt", "of error event", true,orte_process_info.nodename, orte_process_info.pid, event.event_type, openib_event_to_str(event.event_type), xrc_event ? "true" : "false"); + /* Set the flag to indicate port error */ + device->got_port_event = true; + OPAL_THREAD_ADD32(&mca_btl_openib_component.error_counter, 1); break; case IBV_EVENT_COMM_EST: case IBV_EVENT_PORT_ACTIVE: diff --git a/ompi/mca/btl/openib/btl_openib_component.c b/ompi/mca/btl/openib/btl_openib_component.c index c241daa0e0..8b30c80fd4 100644 --- a/ompi/mca/btl/openib/btl_openib_component.c +++ b/ompi/mca/btl/openib/btl_openib_component.c @@ -551,8 +551,8 @@ static inline int param_register_int(const char* param_name, int default_value) #if OPAL_HAVE_THREADS static int start_async_event_thread(void) { - /* Set the fatal counter to zero */ - mca_btl_openib_component.fatal_counter = 0; + /* Set the error counter to zero */ + mca_btl_openib_component.error_counter = 0; /* Create pipe for communication with async event thread */ if(pipe(mca_btl_openib_component.async_pipe)) { @@ -959,6 +959,7 @@ static int prepare_device_for_use(mca_btl_openib_device_t *device) return 
OMPI_ERROR; } device->got_fatal_event = false; + device->got_port_event = false; if (write(mca_btl_openib_component.async_pipe[1], &device->ib_dev_context->async_fd, sizeof(int))<0){ BTL_ERROR(("Failed to write to pipe [%d]",errno)); @@ -3503,7 +3504,7 @@ static int btl_openib_component_progress(void) #if OPAL_HAVE_THREADS if(OPAL_UNLIKELY(mca_btl_openib_component.use_async_event_thread && - mca_btl_openib_component.fatal_counter)) { + mca_btl_openib_component.error_counter)) { goto error; } #endif @@ -3519,8 +3520,8 @@ static int btl_openib_component_progress(void) #if OPAL_HAVE_THREADS error: - /* Set the fatal counter to zero */ - mca_btl_openib_component.fatal_counter = 0; - /* Lets found all fatal events */ + /* Set the error counter to zero */ + mca_btl_openib_component.error_counter = 0; + /* Let's find all error events */ for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) { mca_btl_openib_module_t* openib_btl = mca_btl_openib_component.openib_btls[i]; @@ -3528,6 +3529,10 @@ error: openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL); } + if(openib_btl->device->got_port_event) { + /* Port events are non-fatal, so just clear the flag and ignore them. */ + openib_btl->device->got_port_event = false; + } } return count; #endif