From 12dccaa911da2e83586bbf450ef2c2057953d471 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 29 Dec 2015 07:57:22 -0800 Subject: [PATCH] Standardize the handling of shutdown in the OOB TCP component --- orte/mca/oob/tcp/oob_tcp_component.c | 35 ++++++++-------------------- orte/mca/oob/tcp/oob_tcp_component.h | 12 ++++++++++ 2 files changed, 22 insertions(+), 25 deletions(-) diff --git a/orte/mca/oob/tcp/oob_tcp_component.c b/orte/mca/oob/tcp/oob_tcp_component.c index bc426e5069..bb0d1d9f60 100644 --- a/orte/mca/oob/tcp/oob_tcp_component.c +++ b/orte/mca/oob/tcp/oob_tcp_component.c @@ -970,10 +970,7 @@ void mca_oob_tcp_component_lost_connection(int fd, short args, void *cbdata) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pop->peer)); - /* if we are terminating, or recovery isn't enabled, then don't attempt to reconnect */ - if (!orte_enable_recovery || orte_orteds_term_ordered || orte_finalizing || orte_abnormal_term_ordered) { - goto cleanup; - } + MCA_OOB_TCP_CHECK_SHUTDOWN(pop); /* Mark that we no longer support this peer */ memcpy(&ui64, (char*)&pop->peer, sizeof(uint64_t)); @@ -987,7 +984,6 @@ void mca_oob_tcp_component_lost_connection(int fd, short args, void *cbdata) ORTE_ERROR_LOG(rc); } - cleanup: /* activate the proc state */ if (ORTE_SUCCESS != orte_routed.route_lost(&pop->peer)) { ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_LIFELINE_LOST); @@ -1010,6 +1006,8 @@ void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&mop->hop)); + MCA_OOB_TCP_CHECK_SHUTDOWN(mop); + /* mark that we cannot reach this hop */ memcpy(&ui64, (char*)&(mop->hop), sizeof(uint64_t)); if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers, @@ -1022,16 +1020,11 @@ void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata) ORTE_ERROR_LOG(rc); } - /* report the error back to the OOB and let it try other components - * or declare a problem - */ - if 
(!orte_finalizing && !orte_abnormal_term_ordered) { - /* if this was a lifeline, then alert */ - if (ORTE_SUCCESS != orte_routed.route_lost(&mop->hop)) { - ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_LIFELINE_LOST); - } else { - ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_COMM_FAILED); - } + /* if this was a lifeline, then alert */ + if (ORTE_SUCCESS != orte_routed.route_lost(&mop->hop)) { + ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_LIFELINE_LOST); + } else { + ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_COMM_FAILED); } OBJ_RELEASE(mop); @@ -1049,11 +1042,7 @@ void mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&mop->hop)); - if (orte_finalizing || orte_abnormal_term_ordered) { - /* just ignore the problem */ - OBJ_RELEASE(mop); - return; - } + MCA_OOB_TCP_CHECK_SHUTDOWN(mop); /* mark that this component cannot reach this hop */ memcpy(&ui64, (char*)&(mop->hop), sizeof(uint64_t)); @@ -1121,11 +1110,7 @@ void mca_oob_tcp_component_failed_to_connect(int fd, short args, void *cbdata) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pop->peer)); - /* if we are terminating, then don't attempt to reconnect */ - if (orte_orteds_term_ordered || orte_finalizing || orte_abnormal_term_ordered) { - OBJ_RELEASE(pop); - return; - } + MCA_OOB_TCP_CHECK_SHUTDOWN(pop); /* activate the proc state */ opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, diff --git a/orte/mca/oob/tcp/oob_tcp_component.h b/orte/mca/oob/tcp/oob_tcp_component.h index fb35b86478..7fcbcb2f53 100644 --- a/orte/mca/oob/tcp/oob_tcp_component.h +++ b/orte/mca/oob/tcp/oob_tcp_component.h @@ -92,4 +92,16 @@ ORTE_MODULE_DECLSPEC void mca_oob_tcp_component_failed_to_connect(int fd, short ORTE_MODULE_DECLSPEC void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata); ORTE_MODULE_DECLSPEC void mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata); +/* 
provide a macro for handling errors reported during shutdown: if recovery is disabled or any of the terminate/finalize flags is set, release the caller's object and return from the enclosing void function. The expansion deliberately has no trailing semicolon - the call site supplies it, so the macro behaves as one statement even in an unbraced if/else */
+#define MCA_OOB_TCP_CHECK_SHUTDOWN(a)       \
+    do {                                    \
+        if (!orte_enable_recovery ||        \
+            orte_orteds_term_ordered ||     \
+            orte_finalizing ||              \
+            orte_abnormal_term_ordered) {   \
+            OBJ_RELEASE(a);                 \
+            return;                         \
+        }                                   \
+    } while(0)
+
 #endif /* _MCA_OOB_TCP_COMPONENT_H_ */