diff --git a/orte/mca/oob/tcp/oob_tcp_component.c b/orte/mca/oob/tcp/oob_tcp_component.c index bb0d1d9f60..bc426e5069 100644 --- a/orte/mca/oob/tcp/oob_tcp_component.c +++ b/orte/mca/oob/tcp/oob_tcp_component.c @@ -970,7 +970,10 @@ void mca_oob_tcp_component_lost_connection(int fd, short args, void *cbdata) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pop->peer)); - MCA_OOB_TCP_CHECK_SHUTDOWN(pop); + /* if we are terminating, or recovery isn't enabled, then don't attempt to reconnect */ + if (!orte_enable_recovery || orte_orteds_term_ordered || orte_finalizing || orte_abnormal_term_ordered) { + goto cleanup; + } /* Mark that we no longer support this peer */ memcpy(&ui64, (char*)&pop->peer, sizeof(uint64_t)); @@ -984,6 +987,7 @@ void mca_oob_tcp_component_lost_connection(int fd, short args, void *cbdata) ORTE_ERROR_LOG(rc); } + cleanup: /* activate the proc state */ if (ORTE_SUCCESS != orte_routed.route_lost(&pop->peer)) { ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_LIFELINE_LOST); @@ -1006,8 +1010,6 @@ void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&mop->hop)); - MCA_OOB_TCP_CHECK_SHUTDOWN(mop); - /* mark that we cannot reach this hop */ memcpy(&ui64, (char*)&(mop->hop), sizeof(uint64_t)); if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers, @@ -1020,11 +1022,16 @@ void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata) ORTE_ERROR_LOG(rc); } - /* if this was a lifeline, then alert */ - if (ORTE_SUCCESS != orte_routed.route_lost(&mop->hop)) { - ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_LIFELINE_LOST); - } else { - ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_COMM_FAILED); + /* report the error back to the OOB and let it try other components + * or declare a problem + */ + if (!orte_finalizing && !orte_abnormal_term_ordered) { + /* if this was a lifeline, then alert */ + if (ORTE_SUCCESS != orte_routed.route_lost(&mop->hop)) { + ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_LIFELINE_LOST); + } else { + ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_COMM_FAILED); + } } OBJ_RELEASE(mop); @@ -1042,7 +1049,11 @@ void mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&mop->hop)); - MCA_OOB_TCP_CHECK_SHUTDOWN(mop); + if (orte_finalizing || orte_abnormal_term_ordered) { + /* just ignore the problem */ + OBJ_RELEASE(mop); + return; + } /* mark that this component cannot reach this hop */ memcpy(&ui64, (char*)&(mop->hop), sizeof(uint64_t)); @@ -1110,7 +1121,11 @@ void mca_oob_tcp_component_failed_to_connect(int fd, short args, void *cbdata) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pop->peer)); - MCA_OOB_TCP_CHECK_SHUTDOWN(pop); + /* if we are terminating, then don't attempt to reconnect */ + if (orte_orteds_term_ordered || orte_finalizing || orte_abnormal_term_ordered) { + OBJ_RELEASE(pop); + return; + } /* activate the proc state */ opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, diff --git a/orte/mca/oob/tcp/oob_tcp_component.h b/orte/mca/oob/tcp/oob_tcp_component.h index 7fcbcb2f53..fb35b86478 100644 --- a/orte/mca/oob/tcp/oob_tcp_component.h +++ b/orte/mca/oob/tcp/oob_tcp_component.h @@ -92,16 +92,4 @@ ORTE_MODULE_DECLSPEC void mca_oob_tcp_component_failed_to_connect(int fd, short ORTE_MODULE_DECLSPEC void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata); ORTE_MODULE_DECLSPEC void mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata); -/* provide a macro for handling errors reported during shutdown */ -#define MCA_OOB_TCP_CHECK_SHUTDOWN(a) \ - do { \ - if (!orte_enable_recovery || \ - orte_orteds_term_ordered || \ - orte_finalizing || \ - orte_abnormal_term_ordered) { \ - OBJ_RELEASE(a); \ - return; \ - } \ - } while(0); - #endif /* _MCA_OOB_TCP_COMPONENT_H_ */