Clean up a race condition and an unreliable method that caused us to not properly handle procs that trapped SIGTERM for cleanup purposes while ORTE was trying to kill them. Thanks to Rick Payne and Ian Wells of Cisco for spending weeks chasing this down.
Fix a termination issue that caused procs local to mpirun to not be killed if they weren't calling into the library. Thanks to Terry Dontje for spending countless hours chasing his tail on this one! :-( This commit was SVN r25285.
Этот коммит содержится в:
родитель
2eaadcfab9
Коммит
054c485dcf
@ -2851,7 +2851,13 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
|
||||
kill_local(child->pid, SIGTERM);
|
||||
|
||||
/* check to see if it died - the child_died function will continue
|
||||
* to check every microsecond until we reach the timeout
|
||||
* to check until we reach the timeout
|
||||
*
|
||||
* In practice, it doesn't matter what child_died reports
|
||||
* - we KILL the process anyway, to be sure it's dead.
|
||||
* However, what it does do is delay the KILL until either
|
||||
* the process is verified dead or the timeout elapsed,
|
||||
* which gives it time enough to shut down.
|
||||
*/
|
||||
if (!child_died(child)) {
|
||||
/* if it still isn't dead, try killing it one more time */
|
||||
|
@ -190,10 +190,12 @@ static bool odls_default_child_died(orte_odls_child_t *child)
|
||||
{
|
||||
time_t end;
|
||||
pid_t ret;
|
||||
struct timeval t;
|
||||
fd_set bogus;
|
||||
|
||||
end = time(NULL) + orte_odls_globals.timeout_before_sigkill;
|
||||
/* Because of rounding in time (which returns whole seconds) we
|
||||
* have to add 1 to our wait number: this means that we wait
|
||||
* somewhere between (target) and (target)+1 seconds. Otherwise,
|
||||
* the default 1s actually means 'somewhere between 0 and 1s'. */
|
||||
end = time(NULL) + orte_odls_globals.timeout_before_sigkill + 1;
|
||||
do {
|
||||
ret = waitpid(child->pid, &child->exit_code, WNOHANG);
|
||||
if (child->pid == ret) {
|
||||
@ -208,13 +210,17 @@ static bool odls_default_child_died(orte_odls_child_t *child)
|
||||
* as there is no error - this is a race condition problem
|
||||
* that occasionally causes us to incorrectly report a proc
|
||||
* as refusing to die. Unfortunately, errno may not be reset
|
||||
* by waitpid in this case, so we cannot check it - just assume
|
||||
* the proc has indeed died
|
||||
* by waitpid in this case, so we cannot check it.
|
||||
*
|
||||
* (note the previous fix to this, to return 'process dead'
|
||||
* here, fixes the race condition at the cost of reporting
|
||||
* all live processes have immediately died! Better to
|
||||
* occasionally report a dead process as still living -
|
||||
* which will occasionally trip the timeout for cases that
|
||||
* are right on the edge.)
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
|
||||
"%s odls:default:WAITPID INDICATES PROC %d HAS ALREADY EXITED",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid)));
|
||||
return true;
|
||||
|
||||
/* Do nothing, process still alive */
|
||||
} else if (-1 == ret && ECHILD == errno) {
|
||||
/* The pid no longer exists, so we'll call this "good
|
||||
enough for government work" */
|
||||
@ -224,14 +230,14 @@ static bool odls_default_child_died(orte_odls_child_t *child)
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Bogus delay for 1 usec (sched_yeild() -- even if we have it
|
||||
-- changed behavior in 2.6.3x Linux flavors to be
|
||||
undesirable. */
|
||||
t.tv_sec = 0;
|
||||
t.tv_usec = 1;
|
||||
FD_ZERO(&bogus);
|
||||
FD_SET(0, &bogus);
|
||||
select(1, &bogus, NULL, NULL, &t);
|
||||
/* Bogus delay for 1 msec - let's actually give the CPU some time
|
||||
* to quit the other process (sched_yield() -- even if we have it
|
||||
* -- changed behavior in 2.6.3x Linux flavors to be undesirable)
|
||||
* Don't use select on a bogus file descriptor here as it has proven
|
||||
* unreliable and sometimes immediately returns - we really, really
|
||||
* -do- want to wait a bit!
|
||||
*/
|
||||
usleep(1000);
|
||||
} while (time(NULL) < end);
|
||||
|
||||
/* The child didn't die, so return false */
|
||||
|
@ -739,7 +739,7 @@ int orte_daemon_process_commands(orte_process_name_t* sender,
|
||||
|
||||
/**** EXIT COMMAND ****/
|
||||
case ORTE_DAEMON_EXIT_CMD:
|
||||
if (orte_debug_daemons_flag) {
|
||||
if (1) {
|
||||
opal_output(0, "%s orted_cmd: received exit cmd",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
}
|
||||
@ -757,10 +757,12 @@ int orte_daemon_process_commands(orte_process_name_t* sender,
|
||||
|
||||
/**** HALT VM COMMAND ****/
|
||||
case ORTE_DAEMON_HALT_VM_CMD:
|
||||
if (orte_debug_daemons_flag) {
|
||||
if (1) {
|
||||
opal_output(0, "%s orted_cmd: received halt vm",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
}
|
||||
/* kill the local procs */
|
||||
orte_odls.kill_local_procs(NULL);
|
||||
/* trigger our appropriate exit procedure
|
||||
* NOTE: this event will fire -after- any zero-time events
|
||||
* so any pending relays -do- get sent first
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user