Fix suicide operation when an MPI app loses connection to its local daemon. In that scenario, we correctly call back up to the MPI layer, notifying it of the lost connection. However, when the MPI layer calls back down to tell the RTE to abort, it passes back a flag indicating we should report that error to our local daemon - which is dead. This leads to an infinite loop. Break it by checking the flag indicating that an abnormal termination was ordered by the RTE and, when it is set, not attempting to send the message.
cmr=v1.7.4:reviewer=jsquyres This commit was SVN r30475.
Этот коммит содержится в:
родитель
410a3afa7b
Коммит
4e3d12d9c1
@ -124,6 +124,13 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
orte_process_info.nodename,
|
||||
ORTE_NAME_PRINT(&caddy->name),
|
||||
(NULL == nodename) ? "Unknown" : nodename);
|
||||
/* flag that we must abnormally terminate as far as the
|
||||
* RTE is concerned
|
||||
*/
|
||||
orte_abnormal_term_ordered = true;
|
||||
} else if (ORTE_PROC_STATE_LIFELINE_LOST == caddy->proc_state) {
|
||||
/* we need to die, so mark us so */
|
||||
orte_abnormal_term_ordered = true;
|
||||
}
|
||||
|
||||
orte_errmgr_base_execute_error_callbacks(&errors);
|
||||
|
@ -380,8 +380,11 @@ void orte_ess_base_app_abort(int status, bool report)
|
||||
/* CRS cleanup since it may have a named pipe and thread active */
|
||||
orte_cr_finalize();
|
||||
|
||||
/* If we were asked to report this termination, do so */
|
||||
if (report) {
|
||||
/* If we were asked to report this termination, do so - except
|
||||
* in cases of abnormal termination ordered by the RTE as
|
||||
* this means we can't rely on being able to communicate
|
||||
*/
|
||||
if (report && !orte_abnormal_term_ordered) {
|
||||
buf = OBJ_NEW(opal_buffer_t);
|
||||
opal_dss.pack(buf, &cmd, 1, ORTE_DAEMON_CMD);
|
||||
orte_rml.send_buffer_nb(ORTE_PROC_MY_DAEMON, buf, ORTE_RML_TAG_DAEMON, orte_rml_send_callback, NULL);
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user