1
1

Update signal handling to introduce a pause between SIGCONT and SIGTERM, followed by another pause before SIGKILL. Do this within the odls/kill_local_procs function while we know we are blocked in an event, and before the daemon shuts down the event progress loop

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Ralph Castain 2016-12-06 12:34:42 -08:00
родитель a3e4c33f0e
Коммит 85a634926b
3 изменённых файлов: 60 добавлений и 51 удалений

Просмотреть файл

@ -1379,14 +1379,12 @@ void odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
}
typedef struct {
opal_object_t super;
opal_list_item_t super;
orte_proc_t *child;
orte_odls_base_kill_local_fn_t kill_local;
} orte_odls_quick_caddy_t;
static void qcdcon(orte_odls_quick_caddy_t *p)
{
p->child = NULL;
p->kill_local = NULL;
}
static void qcddes(orte_odls_quick_caddy_t *p)
{
@ -1395,38 +1393,9 @@ static void qcddes(orte_odls_quick_caddy_t *p)
}
}
OBJ_CLASS_INSTANCE(orte_odls_quick_caddy_t,
opal_object_t,
opal_list_item_t,
qcdcon, qcddes);
static void send_kill(int sd, short args, void *cbdata)
{
orte_timer_t *tm = (orte_timer_t*)cbdata;
orte_odls_quick_caddy_t *cd = (orte_odls_quick_caddy_t*)tm->payload;
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
"%s SENDING FORCE SIGKILL TO %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&cd->child->name)));
cd->kill_local(cd->child->pid, SIGKILL);
/* indicate the waitpid fired as this is effectively what
* has happened
*/
ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_WAITPID);
cd->child->pid = 0;
/* ensure the child's session directory is cleaned up */
orte_session_dir_finalize(&cd->child->name);
/* check for everything complete - this will remove
* the child object from our local list
*/
if (ORTE_FLAG_TEST(cd->child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
ORTE_FLAG_TEST(cd->child, ORTE_PROC_FLAG_WAITPID)) {
ORTE_ACTIVATE_PROC_STATE(&cd->child->name, cd->child->state);
}
OBJ_RELEASE(cd);
}
int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
orte_odls_base_kill_local_fn_t kill_local)
{
@ -1536,11 +1505,6 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
}
}
/* mark the child as "killed" since the waitpid will
* fire as soon as we kill it
*/
child->state = ORTE_PROC_STATE_KILLED_BY_CMD; /* we ordered it to die */
/* ensure the stdin IOF channel for this child is closed. The other
* channels will automatically close when the proc is killed
*/
@ -1561,21 +1525,11 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
"%s SENDING SIGCONT TO %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&child->name)));
kill_local(child->pid, SIGCONT);
/* Send a sigterm to the process before sigkill to be nice */
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
"%s SENDING SIGTERM TO %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&child->name)));
kill_local(child->pid, SIGTERM);
cd = OBJ_NEW(orte_odls_quick_caddy_t);
OBJ_RETAIN(child);
cd->child = child;
cd->kill_local = kill_local;
ORTE_DETECT_TIMEOUT(1, orte_odls_globals.timeout_before_sigkill,
10000000, send_kill, cd);
opal_list_append(&procs_killed, &cd->super);
kill_local(child->pid, SIGCONT);
continue;
CLEANUP:
@ -1591,7 +1545,50 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
}
}
/* cleanup, if required */
/* if we are issuing signals, then we need to wait a little
* and send the next in sequence */
if (0 < opal_list_get_size(&procs_killed)) {
sleep(orte_odls_globals.timeout_before_sigkill);
/* issue a SIGTERM to all */
OPAL_LIST_FOREACH(cd, &procs_killed, orte_odls_quick_caddy_t) {
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
"%s SENDING SIGTERM TO %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&child->name)));
kill_local(cd->child->pid, SIGTERM);
}
/* wait a little again */
sleep(orte_odls_globals.timeout_before_sigkill);
/* issue a SIGKILL to all */
OPAL_LIST_FOREACH(cd, &procs_killed, orte_odls_quick_caddy_t) {
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
"%s SENDING SIGKILL TO %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&child->name)));
kill_local(cd->child->pid, SIGKILL);
/* indicate the waitpid fired as this is effectively what
* has happened
*/
ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_WAITPID);
cd->child->pid = 0;
/* mark the child as "killed" */
cd->child->state = ORTE_PROC_STATE_KILLED_BY_CMD; /* we ordered it to die */
/* ensure the child's session directory is cleaned up */
orte_session_dir_finalize(&cd->child->name);
/* check for everything complete - this will remove
* the child object from our local list
*/
if (ORTE_FLAG_TEST(cd->child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
ORTE_FLAG_TEST(cd->child, ORTE_PROC_FLAG_WAITPID)) {
ORTE_ACTIVATE_PROC_STATE(&cd->child->name, cd->child->state);
}
}
}
OPAL_LIST_DESTRUCT(&procs_killed);
/* cleanup arrays, if required */
if (do_cleanup) {
OBJ_DESTRUCT(&procarray);
OBJ_DESTRUCT(&proctmp);

Просмотреть файл

@ -329,9 +329,11 @@ static int do_child(orte_app_context_t* context,
long fd, fdmax = sysconf(_SC_OPEN_MAX);
char *param, *msg;
#if HAVE_SETPGID
/* Set a new process group for this child, so that any
* signals we send to it will reach any children it spawns */
setpgid(0, 0);
#endif
/* Setup the pipe to be close-on-exec */
opal_fd_set_cloexec(write_fd);

Просмотреть файл

@ -28,6 +28,10 @@ void sigusr_handler(int signum)
fprintf(stderr, "%s Trapped SIGUSR2\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
return;
case SIGCONT:
fprintf(stderr, "%s Trapped SIGCONT\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
return;
default:
fprintf(stderr, "%s Undefined signal %d trapped\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), signum);
return;
@ -55,6 +59,7 @@ void exit_handler(int signum)
fprintf(stderr, "%s Undefined signal %d trapped\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), signum);
break;
}
return;
exit(1);
}
@ -79,6 +84,11 @@ int main(int argc, char* argv[])
exit(1);
}
if (signal(SIGCONT, sigusr_handler) == SIG_IGN) {
fprintf(stderr, "Could not setup signal trap for SIGUSR2\n");
exit(1);
}
if (signal(SIGINT, exit_handler) == SIG_IGN) {
fprintf(stderr, "Could not setup signal trap for SIGINT\n");
exit(1);