Update signal handling to introduce a pause between SIGCONT and SIGTERM, followed by another pause before SIGKILL. Do this within the odls/kill_local_procs function, while we know we are blocked in an event and before the daemon shuts down the event progress loop.
Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
родитель
a3e4c33f0e
Коммит
85a634926b
@ -1379,14 +1379,12 @@ void odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
|
|||||||
}
|
}
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
opal_object_t super;
|
opal_list_item_t super;
|
||||||
orte_proc_t *child;
|
orte_proc_t *child;
|
||||||
orte_odls_base_kill_local_fn_t kill_local;
|
|
||||||
} orte_odls_quick_caddy_t;
|
} orte_odls_quick_caddy_t;
|
||||||
static void qcdcon(orte_odls_quick_caddy_t *p)
|
static void qcdcon(orte_odls_quick_caddy_t *p)
|
||||||
{
|
{
|
||||||
p->child = NULL;
|
p->child = NULL;
|
||||||
p->kill_local = NULL;
|
|
||||||
}
|
}
|
||||||
static void qcddes(orte_odls_quick_caddy_t *p)
|
static void qcddes(orte_odls_quick_caddy_t *p)
|
||||||
{
|
{
|
||||||
@ -1395,38 +1393,9 @@ static void qcddes(orte_odls_quick_caddy_t *p)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
OBJ_CLASS_INSTANCE(orte_odls_quick_caddy_t,
|
OBJ_CLASS_INSTANCE(orte_odls_quick_caddy_t,
|
||||||
opal_object_t,
|
opal_list_item_t,
|
||||||
qcdcon, qcddes);
|
qcdcon, qcddes);
|
||||||
|
|
||||||
static void send_kill(int sd, short args, void *cbdata)
|
|
||||||
{
|
|
||||||
orte_timer_t *tm = (orte_timer_t*)cbdata;
|
|
||||||
orte_odls_quick_caddy_t *cd = (orte_odls_quick_caddy_t*)tm->payload;
|
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
|
||||||
"%s SENDING FORCE SIGKILL TO %s",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(&cd->child->name)));
|
|
||||||
|
|
||||||
cd->kill_local(cd->child->pid, SIGKILL);
|
|
||||||
/* indicate the waitpid fired as this is effectively what
|
|
||||||
* has happened
|
|
||||||
*/
|
|
||||||
ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_WAITPID);
|
|
||||||
cd->child->pid = 0;
|
|
||||||
|
|
||||||
/* ensure the child's session directory is cleaned up */
|
|
||||||
orte_session_dir_finalize(&cd->child->name);
|
|
||||||
/* check for everything complete - this will remove
|
|
||||||
* the child object from our local list
|
|
||||||
*/
|
|
||||||
if (ORTE_FLAG_TEST(cd->child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
|
|
||||||
ORTE_FLAG_TEST(cd->child, ORTE_PROC_FLAG_WAITPID)) {
|
|
||||||
ORTE_ACTIVATE_PROC_STATE(&cd->child->name, cd->child->state);
|
|
||||||
}
|
|
||||||
OBJ_RELEASE(cd);
|
|
||||||
}
|
|
||||||
|
|
||||||
int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
|
int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
|
||||||
orte_odls_base_kill_local_fn_t kill_local)
|
orte_odls_base_kill_local_fn_t kill_local)
|
||||||
{
|
{
|
||||||
@ -1536,11 +1505,6 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* mark the child as "killed" since the waitpid will
|
|
||||||
* fire as soon as we kill it
|
|
||||||
*/
|
|
||||||
child->state = ORTE_PROC_STATE_KILLED_BY_CMD; /* we ordered it to die */
|
|
||||||
|
|
||||||
/* ensure the stdin IOF channel for this child is closed. The other
|
/* ensure the stdin IOF channel for this child is closed. The other
|
||||||
* channels will automatically close when the proc is killed
|
* channels will automatically close when the proc is killed
|
||||||
*/
|
*/
|
||||||
@ -1561,21 +1525,11 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
|
|||||||
"%s SENDING SIGCONT TO %s",
|
"%s SENDING SIGCONT TO %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(&child->name)));
|
ORTE_NAME_PRINT(&child->name)));
|
||||||
kill_local(child->pid, SIGCONT);
|
|
||||||
|
|
||||||
/* Send a sigterm to the process before sigkill to be nice */
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
|
||||||
"%s SENDING SIGTERM TO %s",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(&child->name)));
|
|
||||||
kill_local(child->pid, SIGTERM);
|
|
||||||
|
|
||||||
cd = OBJ_NEW(orte_odls_quick_caddy_t);
|
cd = OBJ_NEW(orte_odls_quick_caddy_t);
|
||||||
OBJ_RETAIN(child);
|
OBJ_RETAIN(child);
|
||||||
cd->child = child;
|
cd->child = child;
|
||||||
cd->kill_local = kill_local;
|
opal_list_append(&procs_killed, &cd->super);
|
||||||
ORTE_DETECT_TIMEOUT(1, orte_odls_globals.timeout_before_sigkill,
|
kill_local(child->pid, SIGCONT);
|
||||||
10000000, send_kill, cd);
|
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
CLEANUP:
|
CLEANUP:
|
||||||
@ -1591,7 +1545,50 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* cleanup, if required */
|
/* if we are issuing signals, then we need to wait a little
|
||||||
|
* and send the next in sequence */
|
||||||
|
if (0 < opal_list_get_size(&procs_killed)) {
|
||||||
|
sleep(orte_odls_globals.timeout_before_sigkill);
|
||||||
|
/* issue a SIGTERM to all */
|
||||||
|
OPAL_LIST_FOREACH(cd, &procs_killed, orte_odls_quick_caddy_t) {
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||||
|
"%s SENDING SIGTERM TO %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(&child->name)));
|
||||||
|
kill_local(cd->child->pid, SIGTERM);
|
||||||
|
}
|
||||||
|
/* wait a little again */
|
||||||
|
sleep(orte_odls_globals.timeout_before_sigkill);
|
||||||
|
/* issue a SIGKILL to all */
|
||||||
|
OPAL_LIST_FOREACH(cd, &procs_killed, orte_odls_quick_caddy_t) {
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||||
|
"%s SENDING SIGKILL TO %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(&child->name)));
|
||||||
|
kill_local(cd->child->pid, SIGKILL);
|
||||||
|
/* indicate the waitpid fired as this is effectively what
|
||||||
|
* has happened
|
||||||
|
*/
|
||||||
|
ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_WAITPID);
|
||||||
|
cd->child->pid = 0;
|
||||||
|
|
||||||
|
/* mark the child as "killed" */
|
||||||
|
cd->child->state = ORTE_PROC_STATE_KILLED_BY_CMD; /* we ordered it to die */
|
||||||
|
|
||||||
|
/* ensure the child's session directory is cleaned up */
|
||||||
|
orte_session_dir_finalize(&cd->child->name);
|
||||||
|
/* check for everything complete - this will remove
|
||||||
|
* the child object from our local list
|
||||||
|
*/
|
||||||
|
if (ORTE_FLAG_TEST(cd->child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
|
||||||
|
ORTE_FLAG_TEST(cd->child, ORTE_PROC_FLAG_WAITPID)) {
|
||||||
|
ORTE_ACTIVATE_PROC_STATE(&cd->child->name, cd->child->state);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
OPAL_LIST_DESTRUCT(&procs_killed);
|
||||||
|
|
||||||
|
/* cleanup arrays, if required */
|
||||||
if (do_cleanup) {
|
if (do_cleanup) {
|
||||||
OBJ_DESTRUCT(&procarray);
|
OBJ_DESTRUCT(&procarray);
|
||||||
OBJ_DESTRUCT(&proctmp);
|
OBJ_DESTRUCT(&proctmp);
|
||||||
|
@ -329,9 +329,11 @@ static int do_child(orte_app_context_t* context,
|
|||||||
long fd, fdmax = sysconf(_SC_OPEN_MAX);
|
long fd, fdmax = sysconf(_SC_OPEN_MAX);
|
||||||
char *param, *msg;
|
char *param, *msg;
|
||||||
|
|
||||||
|
#if HAVE_SETPGID
|
||||||
/* Set a new process group for this child, so that any
|
/* Set a new process group for this child, so that any
|
||||||
* signals we send to it will reach any children it spawns */
|
* signals we send to it will reach any children it spawns */
|
||||||
setpgid(0, 0);
|
setpgid(0, 0);
|
||||||
|
#endif
|
||||||
|
|
||||||
/* Setup the pipe to be close-on-exec */
|
/* Setup the pipe to be close-on-exec */
|
||||||
opal_fd_set_cloexec(write_fd);
|
opal_fd_set_cloexec(write_fd);
|
||||||
|
@ -28,6 +28,10 @@ void sigusr_handler(int signum)
|
|||||||
fprintf(stderr, "%s Trapped SIGUSR2\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
fprintf(stderr, "%s Trapped SIGUSR2\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
case SIGCONT:
|
||||||
|
fprintf(stderr, "%s Trapped SIGCONT\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||||
|
return;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
fprintf(stderr, "%s Undefined signal %d trapped\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), signum);
|
fprintf(stderr, "%s Undefined signal %d trapped\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), signum);
|
||||||
return;
|
return;
|
||||||
@ -55,6 +59,7 @@ void exit_handler(int signum)
|
|||||||
fprintf(stderr, "%s Undefined signal %d trapped\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), signum);
|
fprintf(stderr, "%s Undefined signal %d trapped\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), signum);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
return;
|
||||||
|
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
@ -79,6 +84,11 @@ int main(int argc, char* argv[])
|
|||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (signal(SIGCONT, sigusr_handler) == SIG_IGN) {
|
||||||
|
fprintf(stderr, "Could not setup signal trap for SIGUSR2\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
if (signal(SIGINT, exit_handler) == SIG_IGN) {
|
if (signal(SIGINT, exit_handler) == SIG_IGN) {
|
||||||
fprintf(stderr, "Could not setup signal trap for SIGINT\n");
|
fprintf(stderr, "Could not setup signal trap for SIGINT\n");
|
||||||
exit(1);
|
exit(1);
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user