Fix a race condition between the orteds and HNP that can cause the orteds to output the "lost lifeline" message.
This has been a long-standing problem. I tried to reduce it by having the orteds tell the HNP they were finalizing, and having the HNP wait until all orteds had reported or we timed out. What was observed was that all the orteds were correctly reporting that they were leaving, but the HNP was still able to exit before the orteds, thus closing the orteds' lifeline socket and generating the error output. This is caused by the fact that the orteds have to whack all remaining session directories, which includes that blasted monster shared memory (SM) file! Cleaning up the SM file can take quite a while. The HNP doesn't have that problem, as there is no SM file there, so it gets out first. What we had done in the past to resolve that problem was to put a little test in the OOB that checks to see if we are finalizing. If we are, then we ignore the lifeline connection being lost. That check was still in the code - however, we had lost the line in orte_finalize that set the flag (orte_finalizing)! This commit restores it. This commit was SVN r17893.
Этот коммит содержится в:
родитель
8ee26a55ca
Коммит
27a73ad9ee
@ -257,8 +257,9 @@ int orte_ess_base_orted_finalize(void)
|
||||
orte_grpcomm_base_close();
|
||||
orte_routed_base_close();
|
||||
orte_rml_base_close();
|
||||
|
||||
orte_session_dir_finalize(ORTE_PROC_MY_NAME);
|
||||
|
||||
/* cleanup any lingering session directories */
|
||||
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
||||
|
||||
/* clean out the global structures */
|
||||
orte_sys_info_finalize();
|
||||
|
@ -576,11 +576,10 @@ void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t* peer)
|
||||
peer->peer_state);
|
||||
}
|
||||
|
||||
/* if we lose the connection to the lifeline - abort */
|
||||
if (NULL != ORTE_PROC_MY_LIFELINE &&
|
||||
/* if we lose the connection to the lifeline and we are NOT already in finalize - abort */
|
||||
if (!orte_finalizing &&
|
||||
NULL != ORTE_PROC_MY_LIFELINE &&
|
||||
OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &peer->peer_name, ORTE_PROC_MY_LIFELINE)) {
|
||||
/* If we are not already inside orte_finalize, then call abort */
|
||||
if (!orte_finalizing) {
|
||||
/* Should free the peer lock before we abort so we don't
|
||||
* get stuck in the orte_wait_kill when receiving messages in the
|
||||
* tcp OOB. */
|
||||
@ -589,7 +588,6 @@ void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t* peer)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_LIFELINE));
|
||||
orte_errmgr.abort(1, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
mca_oob_tcp_peer_shutdown(peer);
|
||||
|
@ -603,12 +603,11 @@ static void shutdown_callback(int fd, short flags, void *arg)
|
||||
opal_dss.pack(&ack, &exit_code, 1, ORTE_EXIT_CODE);
|
||||
orte_rml.send_buffer(ORTE_PROC_MY_HNP, &ack, ORTE_RML_TAG_PLM, 0);
|
||||
OBJ_DESTRUCT(&ack);
|
||||
/* progress the OOB to ensure the message gets out */
|
||||
opal_progress();
|
||||
}
|
||||
|
||||
/* cleanup any lingering session directories */
|
||||
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
||||
|
||||
/* Finalize and clean up ourselves */
|
||||
/* Finalize and clean up ourselves */
|
||||
if (ORTE_SUCCESS != (ret = orte_finalize())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
|
@ -44,9 +44,12 @@ int orte_finalize(void)
|
||||
}
|
||||
|
||||
/* protect against multiple calls */
|
||||
if (!opal_atomic_trylock(&orte_finalize_lock)) { /* returns 1 if already locked */
|
||||
if (!opal_atomic_trylock(&orte_finalize_lock)) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* set the flag indicating we are finalizing */
|
||||
orte_finalizing = true;
|
||||
|
||||
/* call the finalize function for this environment */
|
||||
orte_ess.finalize();
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user