If a tool sees the HNP it is attached to die (thereby losing its connection), stop the event loop instead of going through the abort code path. This allows the tool to clean up before exiting.
Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
родитель
16dc2e8c79
Коммит
19bb64cfb8
@ -9,7 +9,7 @@
|
||||
* reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2013 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -103,8 +103,14 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
return;
|
||||
}
|
||||
|
||||
/* all errors require abort */
|
||||
orte_errmgr_base_abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL);
|
||||
/* if we lost our lifeline, then just stop the event loop
|
||||
* so the main program can cleanly terminate */
|
||||
if (ORTE_PROC_STATE_LIFELINE_LOST == caddy->proc_state) {
|
||||
orte_event_base_active = false;
|
||||
} else {
|
||||
/* all other errors require abort */
|
||||
orte_errmgr_base_abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL);
|
||||
}
|
||||
|
||||
OBJ_RELEASE(caddy);
|
||||
}
|
||||
|
@ -14,7 +14,7 @@
|
||||
* Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -170,9 +170,14 @@ int orterun(int argc, char *argv[])
|
||||
ORTE_UPDATE_EXIT_STATUS(1);
|
||||
goto DONE;
|
||||
}
|
||||
while (1) {
|
||||
while (orte_event_base_active) {
|
||||
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
||||
}
|
||||
/* we are terminated when the DVM master shuts down, thereby
|
||||
* closing our connection to them. This looks like an error,
|
||||
* but is not - so correct our exit status here */
|
||||
orte_exit_status = 0;
|
||||
goto DONE;
|
||||
} else {
|
||||
/* spawn the job and its daemons */
|
||||
memset(&launchst, 0, sizeof(launchst));
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user