1
1

Merge pull request #1267 from rhc54/topic/oob

Standardize the handling of shutdown in the OOB TCP component
Этот коммит содержится в:
rhc54 2015-12-29 09:10:42 -08:00
родитель ddf0f272e1 12dccaa911
Коммит dc857a9b24
2 изменённых файла: 22 добавления и 25 удалений

Просмотреть файл

@ -970,10 +970,7 @@ void mca_oob_tcp_component_lost_connection(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&pop->peer));
/* if we are terminating, or recovery isn't enabled, then don't attempt to reconnect */
if (!orte_enable_recovery || orte_orteds_term_ordered || orte_finalizing || orte_abnormal_term_ordered) {
goto cleanup;
}
MCA_OOB_TCP_CHECK_SHUTDOWN(pop);
/* Mark that we no longer support this peer */
memcpy(&ui64, (char*)&pop->peer, sizeof(uint64_t));
@ -987,7 +984,6 @@ void mca_oob_tcp_component_lost_connection(int fd, short args, void *cbdata)
ORTE_ERROR_LOG(rc);
}
cleanup:
/* activate the proc state */
if (ORTE_SUCCESS != orte_routed.route_lost(&pop->peer)) {
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_LIFELINE_LOST);
@ -1010,6 +1006,8 @@ void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&mop->hop));
MCA_OOB_TCP_CHECK_SHUTDOWN(mop);
/* mark that we cannot reach this hop */
memcpy(&ui64, (char*)&(mop->hop), sizeof(uint64_t));
if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers,
@ -1022,16 +1020,11 @@ void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata)
ORTE_ERROR_LOG(rc);
}
/* report the error back to the OOB and let it try other components
* or declare a problem
*/
if (!orte_finalizing && !orte_abnormal_term_ordered) {
/* if this was a lifeline, then alert */
if (ORTE_SUCCESS != orte_routed.route_lost(&mop->hop)) {
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_LIFELINE_LOST);
} else {
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_COMM_FAILED);
}
/* if this was a lifeline, then alert */
if (ORTE_SUCCESS != orte_routed.route_lost(&mop->hop)) {
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_LIFELINE_LOST);
} else {
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_COMM_FAILED);
}
OBJ_RELEASE(mop);
@ -1049,11 +1042,7 @@ void mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&mop->hop));
if (orte_finalizing || orte_abnormal_term_ordered) {
/* just ignore the problem */
OBJ_RELEASE(mop);
return;
}
MCA_OOB_TCP_CHECK_SHUTDOWN(mop);
/* mark that this component cannot reach this hop */
memcpy(&ui64, (char*)&(mop->hop), sizeof(uint64_t));
@ -1121,11 +1110,7 @@ void mca_oob_tcp_component_failed_to_connect(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&pop->peer));
/* if we are terminating, then don't attempt to reconnect */
if (orte_orteds_term_ordered || orte_finalizing || orte_abnormal_term_ordered) {
OBJ_RELEASE(pop);
return;
}
MCA_OOB_TCP_CHECK_SHUTDOWN(pop);
/* activate the proc state */
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,

Просмотреть файл

@ -92,4 +92,16 @@ ORTE_MODULE_DECLSPEC void mca_oob_tcp_component_failed_to_connect(int fd, short
/* Callback taking (fd, args, cbdata); per the comment in its definition it
 * marks that the hop cannot be reached.
 * NOTE(review): presumably invoked by the event library - confirm. */
ORTE_MODULE_DECLSPEC void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata);
/* Callback taking (fd, args, cbdata); per the comment in its definition it
 * marks that this component cannot reach the hop.
 * NOTE(review): presumably invoked by the event library - confirm. */
ORTE_MODULE_DECLSPEC void mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata);
/*
 * Provide a macro for handling errors reported during shutdown.
 *
 * If recovery is not enabled (orte_enable_recovery is false), or any of
 * orte_orteds_term_ordered, orte_finalizing or orte_abnormal_term_ordered
 * is set, the macro releases the object (a) with OBJ_RELEASE and returns
 * from the enclosing function. Otherwise it does nothing.
 *
 * Because the expansion contains a bare "return;", it may only be used
 * inside functions returning void. The argument is evaluated only when
 * the early-return path is taken.
 *
 * The definition deliberately does NOT end in a semicolon: the caller's
 * own ';' completes the do/while statement, so an invocation is exactly
 * one statement and is safe in an unbraced if/else.
 */
#define MCA_OOB_TCP_CHECK_SHUTDOWN(a)           \
    do {                                        \
        if (!orte_enable_recovery ||            \
            orte_orteds_term_ordered ||         \
            orte_finalizing ||                  \
            orte_abnormal_term_ordered) {       \
            OBJ_RELEASE(a);                     \
            return;                             \
        }                                       \
    } while(0)
#endif /* _MCA_OOB_TCP_COMPONENT_H_ */