Always follow up with SIGKILL when killing local procs, as procs can trap SIGTERM and get stuck
This commit was SVN r24719.
Этот коммит содержится в:
родитель
26034f4a96
Коммит
1b5ca323c6
@ -2980,9 +2980,9 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
|
|||||||
orte_wait_cb_cancel(child->pid);
|
orte_wait_cb_cancel(child->pid);
|
||||||
|
|
||||||
/* First send a SIGCONT in case the process is in stopped state.
|
/* First send a SIGCONT in case the process is in stopped state.
|
||||||
If it is in a stopped state and we do not first change it to
|
If it is in a stopped state and we do not first change it to
|
||||||
running, then SIGTERM will not get delivered. Ignore return
|
running, then SIGTERM will not get delivered. Ignore return
|
||||||
value. */
|
value. */
|
||||||
kill_local(child->pid, SIGCONT);
|
kill_local(child->pid, SIGCONT);
|
||||||
|
|
||||||
/* Send a sigterm to the process before sigkill to be nice */
|
/* Send a sigterm to the process before sigkill to be nice */
|
||||||
@ -3000,16 +3000,14 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
|
|||||||
"odls-default:could-not-kill",
|
"odls-default:could-not-kill",
|
||||||
true, orte_process_info.nodename, child->pid);
|
true, orte_process_info.nodename, child->pid);
|
||||||
}
|
}
|
||||||
}
|
} else {
|
||||||
#if OPAL_ENABLE_FT_CR
|
/* Force the SIGKILL just to make sure things are dead
|
||||||
/* Force the SIGKILL just to make sure things are dead
|
* This fixes an issue that, if the application is masking
|
||||||
* This fixes an issue with process migration/autorecovery
|
* SIGTERM, then the child_died()
|
||||||
* if the application is masking SIGTERM then the child_died()
|
* may return 'true' even though waitpid returns with 0.
|
||||||
* may return 'true' even though waitpid returns with 0.
|
* It does this to avoid a race condition, per documentation
|
||||||
* It does this to avoid a race condition, per documentation
|
* in odls_default_module.c.
|
||||||
* in odls_default_module.c.
|
*/
|
||||||
*/
|
|
||||||
else {
|
|
||||||
kill_local(child->pid, SIGKILL);
|
kill_local(child->pid, SIGKILL);
|
||||||
/* Double check that it actually died this time */
|
/* Double check that it actually died this time */
|
||||||
if (!child_died(child)) {
|
if (!child_died(child)) {
|
||||||
@ -3018,7 +3016,7 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
|
|||||||
true, orte_process_info.nodename, child->pid);
|
true, orte_process_info.nodename, child->pid);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||||
"%s odls:kill_local_proc child %s killed",
|
"%s odls:kill_local_proc child %s killed",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user