1
1

Merge pull request #7032 from jjhursey/fix-sigkill-wait

Fix the sigkill timeout sleep so that an interrupting SIGCHLD cannot cut the wait short
Этот коммит содержится в:
Josh Hursey 2019-10-02 14:48:27 -05:00 коммит произвёл GitHub
родитель 7ddfa6950b 0e8a97c598
Коммит b774b47428
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23

Просмотреть файл

@@ -1767,7 +1767,7 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
orte_proc_t *child; orte_proc_t *child;
opal_list_t procs_killed; opal_list_t procs_killed;
orte_proc_t *proc, proctmp; orte_proc_t *proc, proctmp;
int i, j; int i, j, ret;
opal_pointer_array_t procarray, *procptr; opal_pointer_array_t procarray, *procptr;
bool do_cleanup; bool do_cleanup;
orte_odls_quick_caddy_t *cd; orte_odls_quick_caddy_t *cd;
@@ -1913,7 +1913,17 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
/* if we are issuing signals, then we need to wait a little /* if we are issuing signals, then we need to wait a little
* and send the next in sequence */ * and send the next in sequence */
if (0 < opal_list_get_size(&procs_killed)) { if (0 < opal_list_get_size(&procs_killed)) {
sleep(orte_odls_globals.timeout_before_sigkill); /* Wait a little. Do so in a loop since sleep() can be interrupted by a
* signal. Most likely SIGCHLD in this case */
ret = orte_odls_globals.timeout_before_sigkill;
while( ret > 0 ) {
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
"%s Sleep %d sec (total = %d)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ret, orte_odls_globals.timeout_before_sigkill));
ret = sleep(ret);
}
/* issue a SIGTERM to all */ /* issue a SIGTERM to all */
OPAL_LIST_FOREACH(cd, &procs_killed, orte_odls_quick_caddy_t) { OPAL_LIST_FOREACH(cd, &procs_killed, orte_odls_quick_caddy_t) {
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
@@ -1922,8 +1932,18 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
ORTE_NAME_PRINT(&cd->child->name))); ORTE_NAME_PRINT(&cd->child->name)));
kill_local(cd->child->pid, SIGTERM); kill_local(cd->child->pid, SIGTERM);
} }
/* wait a little again */
sleep(orte_odls_globals.timeout_before_sigkill); /* Wait a little. Do so in a loop since sleep() can be interrupted by a
* signal. Most likely SIGCHLD in this case */
ret = orte_odls_globals.timeout_before_sigkill;
while( ret > 0 ) {
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
"%s Sleep %d sec (total = %d)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ret, orte_odls_globals.timeout_before_sigkill));
ret = sleep(ret);
}
/* issue a SIGKILL to all */ /* issue a SIGKILL to all */
OPAL_LIST_FOREACH(cd, &procs_killed, orte_odls_quick_caddy_t) { OPAL_LIST_FOREACH(cd, &procs_killed, orte_odls_quick_caddy_t) {
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,