Merge pull request #7032 from jjhursey/fix-sigkill-wait
Fix the SIGKILL timeout sleep so that an interrupting SIGCHLD cannot cut the wait short
Этот коммит содержится в:
Коммит
b774b47428
@ -1767,7 +1767,7 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
|
|||||||
orte_proc_t *child;
|
orte_proc_t *child;
|
||||||
opal_list_t procs_killed;
|
opal_list_t procs_killed;
|
||||||
orte_proc_t *proc, proctmp;
|
orte_proc_t *proc, proctmp;
|
||||||
int i, j;
|
int i, j, ret;
|
||||||
opal_pointer_array_t procarray, *procptr;
|
opal_pointer_array_t procarray, *procptr;
|
||||||
bool do_cleanup;
|
bool do_cleanup;
|
||||||
orte_odls_quick_caddy_t *cd;
|
orte_odls_quick_caddy_t *cd;
|
||||||
@ -1913,7 +1913,17 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
|
|||||||
/* if we are issuing signals, then we need to wait a little
|
/* if we are issuing signals, then we need to wait a little
|
||||||
* and send the next in sequence */
|
* and send the next in sequence */
|
||||||
if (0 < opal_list_get_size(&procs_killed)) {
|
if (0 < opal_list_get_size(&procs_killed)) {
|
||||||
sleep(orte_odls_globals.timeout_before_sigkill);
|
/* Wait a little. Do so in a loop since sleep() can be interrupted by a
|
||||||
|
* signal. Most likely SIGCHLD in this case */
|
||||||
|
ret = orte_odls_globals.timeout_before_sigkill;
|
||||||
|
while( ret > 0 ) {
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||||
|
"%s Sleep %d sec (total = %d)",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ret, orte_odls_globals.timeout_before_sigkill));
|
||||||
|
ret = sleep(ret);
|
||||||
|
}
|
||||||
|
|
||||||
/* issue a SIGTERM to all */
|
/* issue a SIGTERM to all */
|
||||||
OPAL_LIST_FOREACH(cd, &procs_killed, orte_odls_quick_caddy_t) {
|
OPAL_LIST_FOREACH(cd, &procs_killed, orte_odls_quick_caddy_t) {
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||||
@ -1922,8 +1932,18 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
|
|||||||
ORTE_NAME_PRINT(&cd->child->name)));
|
ORTE_NAME_PRINT(&cd->child->name)));
|
||||||
kill_local(cd->child->pid, SIGTERM);
|
kill_local(cd->child->pid, SIGTERM);
|
||||||
}
|
}
|
||||||
/* wait a little again */
|
|
||||||
sleep(orte_odls_globals.timeout_before_sigkill);
|
/* Wait a little. Do so in a loop since sleep() can be interrupted by a
|
||||||
|
* signal. Most likely SIGCHLD in this case */
|
||||||
|
ret = orte_odls_globals.timeout_before_sigkill;
|
||||||
|
while( ret > 0 ) {
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||||
|
"%s Sleep %d sec (total = %d)",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ret, orte_odls_globals.timeout_before_sigkill));
|
||||||
|
ret = sleep(ret);
|
||||||
|
}
|
||||||
|
|
||||||
/* issue a SIGKILL to all */
|
/* issue a SIGKILL to all */
|
||||||
OPAL_LIST_FOREACH(cd, &procs_killed, orte_odls_quick_caddy_t) {
|
OPAL_LIST_FOREACH(cd, &procs_killed, orte_odls_quick_caddy_t) {
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user