Start setting a flag when a port error is detected on the openib BTL.
At this point, the flag is just cleared (and otherwise ignored), so the default behavior has not changed. However, future failover support can take advantage of this flag. Reviewed by Pasha Shamis. This commit was SVN r23204.
Этот коммит содержится в:
родитель
02cc0cde83
Коммит
27f070a575
@ -15,6 +15,7 @@
|
||||
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -223,7 +224,7 @@ struct mca_btl_openib_component_t {
|
||||
int32_t apm_ports;
|
||||
uint32_t buffer_alignment; /**< Preferred communication buffer alignment in Bytes (must be power of two) */
|
||||
#if OPAL_HAVE_THREADS
|
||||
int32_t fatal_counter; /**< Counts number of fatal events that we got on all devices */
|
||||
int32_t error_counter; /**< Counts number of error events that we got on all devices */
|
||||
int async_pipe[2]; /**< Pipe for communication with async event thread */
|
||||
int async_comp_pipe[2]; /**< Pipe for async thread communication with main thread */
|
||||
pthread_t async_thread; /**< Async thread that will handle fatal errors */
|
||||
@ -356,6 +357,7 @@ typedef struct mca_btl_openib_device_t {
|
||||
bool pollme;
|
||||
#if OPAL_HAVE_THREADS
|
||||
volatile bool got_fatal_event;
|
||||
volatile bool got_port_event;
|
||||
#endif
|
||||
#if HAVE_XRC
|
||||
struct ibv_xrc_domain *xrc_domain;
|
||||
|
@ -2,6 +2,7 @@
|
||||
* Copyright (c) 2008-2009 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -340,18 +341,26 @@ static int btl_openib_async_deviceh(struct mca_btl_openib_async_poll *devices_po
|
||||
/* Set the flag to fatal */
|
||||
device->got_fatal_event = true;
|
||||
/* It is not critical to protect the counter */
|
||||
OPAL_THREAD_ADD32(&mca_btl_openib_component.fatal_counter, 1);
|
||||
OPAL_THREAD_ADD32(&mca_btl_openib_component.error_counter, 1);
|
||||
case IBV_EVENT_CQ_ERR:
|
||||
case IBV_EVENT_QP_FATAL:
|
||||
case IBV_EVENT_QP_REQ_ERR:
|
||||
case IBV_EVENT_QP_ACCESS_ERR:
|
||||
case IBV_EVENT_PATH_MIG_ERR:
|
||||
case IBV_EVENT_SRQ_ERR:
|
||||
orte_show_help("help-mpi-btl-openib.txt", "of error event",
|
||||
true,orte_process_info.nodename, orte_process_info.pid,
|
||||
event.event_type, openib_event_to_str(event.event_type),
|
||||
xrc_event ? "true" : "false");
|
||||
break;
|
||||
case IBV_EVENT_PORT_ERR:
|
||||
orte_show_help("help-mpi-btl-openib.txt", "of error event",
|
||||
true,orte_process_info.nodename, orte_process_info.pid,
|
||||
event.event_type, openib_event_to_str(event.event_type),
|
||||
xrc_event ? "true" : "false");
|
||||
/* Set the flag to indicate port error */
|
||||
device->got_port_event = true;
|
||||
OPAL_THREAD_ADD32(&mca_btl_openib_component.error_counter, 1);
|
||||
break;
|
||||
case IBV_EVENT_COMM_EST:
|
||||
case IBV_EVENT_PORT_ACTIVE:
|
||||
|
@ -551,8 +551,8 @@ static inline int param_register_int(const char* param_name, int default_value)
|
||||
#if OPAL_HAVE_THREADS
|
||||
static int start_async_event_thread(void)
|
||||
{
|
||||
/* Set the fatal counter to zero */
|
||||
mca_btl_openib_component.fatal_counter = 0;
|
||||
/* Set the error counter to zero */
|
||||
mca_btl_openib_component.error_counter = 0;
|
||||
|
||||
/* Create pipe for communication with async event thread */
|
||||
if(pipe(mca_btl_openib_component.async_pipe)) {
|
||||
@ -959,6 +959,7 @@ static int prepare_device_for_use(mca_btl_openib_device_t *device)
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
device->got_fatal_event = false;
|
||||
device->got_port_event = false;
|
||||
if (write(mca_btl_openib_component.async_pipe[1],
|
||||
&device->ib_dev_context->async_fd, sizeof(int))<0){
|
||||
BTL_ERROR(("Failed to write to pipe [%d]",errno));
|
||||
@ -3503,7 +3504,7 @@ static int btl_openib_component_progress(void)
|
||||
|
||||
#if OPAL_HAVE_THREADS
|
||||
if(OPAL_UNLIKELY(mca_btl_openib_component.use_async_event_thread &&
|
||||
mca_btl_openib_component.fatal_counter)) {
|
||||
mca_btl_openib_component.error_counter)) {
|
||||
goto error;
|
||||
}
|
||||
#endif
|
||||
@ -3519,8 +3520,8 @@ static int btl_openib_component_progress(void)
|
||||
#if OPAL_HAVE_THREADS
|
||||
error:
|
||||
/* Set the fatal counter to zero */
|
||||
mca_btl_openib_component.fatal_counter = 0;
|
||||
/* Let's find all fatal events */
|
||||
mca_btl_openib_component.error_counter = 0;
|
||||
/* Let's find all error events */
|
||||
for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
|
||||
mca_btl_openib_module_t* openib_btl =
|
||||
mca_btl_openib_component.openib_btls[i];
|
||||
@ -3528,6 +3529,10 @@ error:
|
||||
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL,
|
||||
NULL, NULL);
|
||||
}
|
||||
if(openib_btl->device->got_port_event) {
|
||||
/* These are non-fatal so just ignore it. */
|
||||
openib_btl->device->got_port_event = false;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
#endif
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user