From f0f98f13b646d42bffc2c5d207f45837faf5e779 Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Tue, 6 Jan 2015 15:42:38 -0700 Subject: [PATCH] odls/base: fix an edge case with signals In the course of doing some testing of how orteds handle signaled child processes, we found that very often doing a kill -9 on a process on a node just results in the job hanging. The problem was that the orted odls/errmgr was not properly handling the exit_code being returned from waitpid. Now mark the proc state as ORTE_PROC_STATE_ABORTED_BY_SIG if the exit_code from waitpid indicates the process exited owing to a signal. --- orte/mca/odls/base/odls_base_default_fns.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index a61bbc6696..d1aa808306 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -1649,16 +1649,20 @@ void odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata) /* if the child was previously flagged as dead, then just * update its exit status and * ensure that its exit state gets reported to avoid hanging + * don't forget to check if the process was signaled. */ if (!ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_ALIVE)) { OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output, - "%s odls:waitpid_fired child %s was already dead", + "%s odls:waitpid_fired child %s was already dead exit code %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); + ORTE_NAME_PRINT(&proc->name),proc->exit_code)); if (WIFEXITED(proc->exit_code)) { proc->exit_code = WEXITSTATUS(proc->exit_code); } else { - proc->exit_code = WTERMSIG(proc->exit_code) + 128; + if (WIFSIGNALED(proc->exit_code)) { + state = ORTE_PROC_STATE_ABORTED_BY_SIG; + proc->exit_code = WTERMSIG(proc->exit_code) + 128; + } } goto MOVEON; }