Cleanup the forced termination a bit by restoring the delay before issuing the sigkill, and eliminating the large time loss spent checking if the proc died. The latter is responsible for a large number of test timeouts in MTT
Update alps component
Этот коммит содержится в:
родитель
9f43b23725
Коммит
0ba9572f9f
@ -164,70 +164,6 @@ orte_odls_base_module_t orte_odls_alps_module = {
|
||||
};
|
||||
|
||||
|
||||
static bool odls_alps_child_died(orte_proc_t *child)
|
||||
{
|
||||
time_t end;
|
||||
pid_t ret;
|
||||
|
||||
/* Because of rounding in time (which returns whole seconds) we
|
||||
* have to add 1 to our wait number: this means that we wait
|
||||
* somewhere between (target) and (target)+1 seconds. Otherwise,
|
||||
* the default 1s actually means 'somwhere between 0 and 1s'. */
|
||||
end = time(NULL) + orte_odls_globals.timeout_before_sigkill + 1;
|
||||
do {
|
||||
OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output,
|
||||
"%s odls:alps:WAITPID CHECKING PID %d WITH TIMEOUT %d SECONDS",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid),
|
||||
orte_odls_globals.timeout_before_sigkill + 1));
|
||||
ret = waitpid(child->pid, &child->exit_code, WNOHANG);
|
||||
if (child->pid == ret) {
|
||||
OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output,
|
||||
"%s odls:alps:WAITPID INDICATES PROC %d IS DEAD",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid)));
|
||||
/* It died -- return success */
|
||||
return true;
|
||||
} else if (0 == ret) {
|
||||
/* with NOHANG specified, if a process has already exited
|
||||
* while waitpid was registered, then waitpid returns 0
|
||||
* as there is no error - this is a race condition problem
|
||||
* that occasionally causes us to incorrectly report a proc
|
||||
* as refusing to die. Unfortunately, errno may not be reset
|
||||
* by waitpid in this case, so we cannot check it.
|
||||
*
|
||||
* (note the previous fix to this, to return 'process dead'
|
||||
* here, fixes the race condition at the cost of reporting
|
||||
* all live processes have immediately died! Better to
|
||||
* occasionally report a dead process as still living -
|
||||
* which will occasionally trip the timeout for cases that
|
||||
* are right on the edge.)
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output,
|
||||
"%s odls:alps:WAITPID INDICATES PID %d MAY HAVE ALREADY EXITED",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid)));
|
||||
/* Do nothing, process still alive */
|
||||
} else if (-1 == ret && ECHILD == errno) {
|
||||
/* The pid no longer exists, so we'll call this "good
|
||||
enough for government work" */
|
||||
OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output,
|
||||
"%s odls:alps:WAITPID INDICATES PID %d NO LONGER EXISTS",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid)));
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Bogus delay for 1 msec - let's actually give the CPU some time
|
||||
* to quit the other process (sched_yield() -- even if we have it
|
||||
* -- changed behavior in 2.6.3x Linux flavors to be undesirable)
|
||||
* Don't use select on a bogus file descriptor here as it has proven
|
||||
* unreliable and sometimes immediately returns - we really, really
|
||||
* -do- want to wait a bit!
|
||||
*/
|
||||
usleep(1000);
|
||||
} while (time(NULL) < end);
|
||||
|
||||
/* The child didn't die, so return false */
|
||||
return false;
|
||||
}
|
||||
|
||||
static int odls_alps_kill_local(pid_t pid, int signum)
|
||||
{
|
||||
pid_t pgrp;
|
||||
@ -264,7 +200,7 @@ int orte_odls_alps_kill_local_procs(opal_pointer_array_t *procs)
|
||||
int rc;
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_odls_base_default_kill_local_procs(procs,
|
||||
odls_alps_kill_local, odls_alps_child_died))) {
|
||||
odls_alps_kill_local))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
@ -1366,9 +1366,57 @@ void odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
|
||||
ORTE_ACTIVATE_PROC_STATE(&proc->name, state);
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
opal_object_t super;
|
||||
orte_proc_t *child;
|
||||
orte_odls_base_kill_local_fn_t kill_local;
|
||||
} orte_odls_quick_caddy_t;
|
||||
static void qcdcon(orte_odls_quick_caddy_t *p)
|
||||
{
|
||||
p->child = NULL;
|
||||
p->kill_local = NULL;
|
||||
}
|
||||
static void qcddes(orte_odls_quick_caddy_t *p)
|
||||
{
|
||||
if (NULL != p->child) {
|
||||
OBJ_RELEASE(p->child);
|
||||
}
|
||||
}
|
||||
OBJ_CLASS_INSTANCE(orte_odls_quick_caddy_t,
|
||||
opal_object_t,
|
||||
qcdcon, qcddes);
|
||||
|
||||
static void send_kill(int sd, short args, void *cbdata)
|
||||
{
|
||||
orte_timer_t *tm = (orte_timer_t*)cbdata;
|
||||
orte_odls_quick_caddy_t *cd = (orte_odls_quick_caddy_t*)tm->payload;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||
"%s SENDING FORCE SIGKILL TO %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&cd->child->name)));
|
||||
|
||||
cd->kill_local(cd->child->pid, SIGKILL);
|
||||
/* indicate the waitpid fired as this is effectively what
|
||||
* has happened
|
||||
*/
|
||||
ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_WAITPID);
|
||||
cd->child->pid = 0;
|
||||
|
||||
/* ensure the child's session directory is cleaned up */
|
||||
orte_session_dir_finalize(&cd->child->name);
|
||||
/* check for everything complete - this will remove
|
||||
* the child object from our local list
|
||||
*/
|
||||
if (ORTE_FLAG_TEST(cd->child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
|
||||
ORTE_FLAG_TEST(cd->child, ORTE_PROC_FLAG_WAITPID)) {
|
||||
ORTE_ACTIVATE_PROC_STATE(&cd->child->name, cd->child->state);
|
||||
}
|
||||
OBJ_RELEASE(cd);
|
||||
}
|
||||
|
||||
int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
|
||||
orte_odls_base_kill_local_fn_t kill_local,
|
||||
orte_odls_base_child_died_fn_t child_died)
|
||||
orte_odls_base_kill_local_fn_t kill_local)
|
||||
{
|
||||
orte_proc_t *child;
|
||||
opal_list_t procs_killed;
|
||||
@ -1376,6 +1424,7 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
|
||||
int i, j;
|
||||
opal_pointer_array_t procarray, *procptr;
|
||||
bool do_cleanup;
|
||||
orte_odls_quick_caddy_t *cd;
|
||||
|
||||
OBJ_CONSTRUCT(&procs_killed, opal_list_t);
|
||||
|
||||
@ -1492,22 +1541,30 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
|
||||
*/
|
||||
orte_wait_cb_cancel(child);
|
||||
|
||||
/* Use SIGKILL just to make sure things are dead
|
||||
* This fixes an issue that, if the application is masking
|
||||
* SIGTERM, then the child_died() may return 'true' even
|
||||
* though waipid returns with 0. It does this to avoid a
|
||||
* race condition, per documentation in odls_default_module.c.
|
||||
*/
|
||||
/* First send a SIGCONT in case the process is in stopped state.
|
||||
If it is in a stopped state and we do not first change it to
|
||||
running, then SIGTERM will not get delivered. Ignore return
|
||||
value. */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||
"%s SENDING FORCE SIGKILL TO %s pid %lu",
|
||||
"%s SENDING SIGCONT TO %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&child->name), (unsigned long)child->pid));
|
||||
kill_local(child->pid, SIGKILL);
|
||||
/* indicate the waitpid fired as this is effectively what
|
||||
* has happened
|
||||
*/
|
||||
ORTE_FLAG_SET(child, ORTE_PROC_FLAG_WAITPID);
|
||||
child->pid = 0;
|
||||
ORTE_NAME_PRINT(&child->name)));
|
||||
kill_local(child->pid, SIGCONT);
|
||||
|
||||
/* Send a sigterm to the process before sigkill to be nice */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||
"%s SENDING SIGTERM TO %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&child->name)));
|
||||
kill_local(child->pid, SIGTERM);
|
||||
|
||||
cd = OBJ_NEW(orte_odls_quick_caddy_t);
|
||||
OBJ_RETAIN(child);
|
||||
cd->child = child;
|
||||
cd->kill_local = kill_local;
|
||||
ORTE_DETECT_TIMEOUT(1, orte_odls_globals.timeout_before_sigkill,
|
||||
10000000, send_kill, cd);
|
||||
continue;
|
||||
|
||||
CLEANUP:
|
||||
/* ensure the child's session directory is cleaned up */
|
||||
|
@ -121,8 +121,7 @@ typedef bool (*orte_odls_base_child_died_fn_t)(orte_proc_t *child);
|
||||
|
||||
ORTE_DECLSPEC int
|
||||
orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
|
||||
orte_odls_base_kill_local_fn_t kill_local,
|
||||
orte_odls_base_child_died_fn_t child_died);
|
||||
orte_odls_base_kill_local_fn_t kill_local);
|
||||
|
||||
ORTE_DECLSPEC int orte_odls_base_default_restart_proc(orte_proc_t *child,
|
||||
orte_odls_base_fork_local_proc_fn_t fork_local);
|
||||
|
@ -164,71 +164,6 @@ orte_odls_base_module_t orte_odls_default_module = {
|
||||
};
|
||||
|
||||
|
||||
static bool odls_default_child_died(orte_proc_t *child)
|
||||
{
|
||||
time_t end;
|
||||
pid_t ret;
|
||||
|
||||
/* Because of rounding in time (which returns whole seconds) we
|
||||
* have to add 1 to our wait number: this means that we wait
|
||||
* somewhere between (target) and (target)+1 seconds. Otherwise,
|
||||
* the default 1s actually means 'somwhere between 0 and 1s'. */
|
||||
end = time(NULL) + orte_odls_globals.timeout_before_sigkill + 1;
|
||||
do {
|
||||
OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output,
|
||||
"%s odls:default:WAITPID CHECKING PID %d WITH TIMEOUT %d SECONDS",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid),
|
||||
orte_odls_globals.timeout_before_sigkill + 1));
|
||||
ret = waitpid(child->pid, &child->exit_code, WNOHANG);
|
||||
if (child->pid == ret) {
|
||||
OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output,
|
||||
"%s odls:default:WAITPID INDICATES PROC %d IS DEAD",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid)));
|
||||
/* It died -- return success */
|
||||
return true;
|
||||
} else if (0 == ret) {
|
||||
/* with NOHANG specified, if a process has already exited
|
||||
* while waitpid was registered, then waitpid returns 0
|
||||
* as there is no error - this is a race condition problem
|
||||
* that occasionally causes us to incorrectly report a proc
|
||||
* as refusing to die. Unfortunately, errno may not be reset
|
||||
* by waitpid in this case, so we cannot check it.
|
||||
*
|
||||
* (note the previous fix to this, to return 'process dead'
|
||||
* here, fixes the race condition at the cost of reporting
|
||||
* all live processes have immediately died! Better to
|
||||
* occasionally report a dead process as still living -
|
||||
* which will occasionally trip the timeout for cases that
|
||||
* are right on the edge.)
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output,
|
||||
"%s odls:default:WAITPID INDICATES PID %d MAY HAVE ALREADY EXITED",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid)));
|
||||
/* Do nothing, process still alive */
|
||||
} else if (-1 == ret && ECHILD == errno) {
|
||||
/* The pid no longer exists, so we'll call this "good
|
||||
enough for government work" */
|
||||
OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output,
|
||||
"%s odls:default:WAITPID INDICATES PID %d NO LONGER EXISTS",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid)));
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Bogus delay for 1 msec - let's actually give the CPU some time
|
||||
* to quit the other process (sched_yield() -- even if we have it
|
||||
* -- changed behavior in 2.6.3x Linux flavors to be undesirable)
|
||||
* Don't use select on a bogus file descriptor here as it has proven
|
||||
* unreliable and sometimes immediately returns - we really, really
|
||||
* -do- want to wait a bit!
|
||||
*/
|
||||
usleep(1000);
|
||||
} while (time(NULL) < end);
|
||||
|
||||
/* The child didn't die, so return false */
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/* deliver a signal to a specified pid. */
|
||||
static int odls_default_kill_local(pid_t pid, int signum)
|
||||
{
|
||||
@ -251,7 +186,7 @@ int orte_odls_default_kill_local_procs(opal_pointer_array_t *procs)
|
||||
int rc;
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_odls_base_default_kill_local_procs(procs,
|
||||
odls_default_kill_local, odls_default_child_died))) {
|
||||
odls_default_kill_local))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user