1
1

Eliminate a race condition (first reported by Josh) that occurs when deliberately killing procs. We need to cancel the waitpid callback for the proc and then flag it as dead (both not-alive and waitpid-fired) so that the system cleans up properly.

This commit was SVN r22900.
Этот коммит содержится в:
Ralph Castain 2010-03-28 16:08:05 +00:00
родитель 4f9db20d94
Коммит 2603bd8a47
4 изменённых файлов: 31 добавлений и 17 удалений

Просмотреть файл

@ -106,6 +106,10 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
/* get a pointer to the job map */
map = jdata->map;
/* if there is no map, just return */
if (NULL == map) {
return ORTE_SUCCESS;
}
/* are we passing a regexp? */
if (orte_use_regexp && jdata->num_apps < 2 && NULL == orte_debugger_daemon) {
@ -2971,13 +2975,13 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
{
orte_odls_child_t *child;
opal_list_item_t *item;
int rc = ORTE_SUCCESS, exit_status = 0;
int rc = ORTE_SUCCESS;
opal_list_t procs_killed;
orte_proc_t *proc, proctmp;
int i;
opal_pointer_array_t procarray, *procptr;
bool do_cleanup;
OBJ_CONSTRUCT(&procs_killed, opal_list_t);
/* since we are going to be working with the global list of
@ -3092,6 +3096,11 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
*/
orte_iof.close(child->name, ORTE_IOF_STDIN);
/* cancel the waitpid callback as this induces unmanageable race
* conditions when we are deliberately killing the process
*/
orte_wait_cb_cancel(child->pid);
/* First send a SIGCONT in case the process is in stopped state.
If it is in a stopped state and we do not first change it to
running, then SIGTERM will not get delivered. Ignore return
@ -3104,11 +3113,11 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
/* check to see if it died - the child_died function will continue
* to check every microsecond until we reach the timeout
*/
if (!child_died(child->pid, orte_odls_globals.timeout_before_sigkill, &exit_status)) {
if (!child_died(child)) {
/* if it still isn't dead, try killing it one more time */
kill_local(child->pid, SIGKILL);
/* Double check that it actually died this time */
if (!child_died(child->pid, orte_odls_globals.timeout_before_sigkill, &exit_status)) {
if (!child_died(child)) {
orte_show_help("help-odls-default.txt",
"odls-default:could-not-kill",
true, orte_process_info.nodename, child->pid);
@ -3119,13 +3128,19 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name)));
/* set the process to "not alive" */
/* indicate the waitpid fired as this is effectively what
* has happened
*/
child->waitpid_recvd = true;
/* ensure the process is flagged as "not alive" */
child->alive = false;
CLEANUP:
/* ensure the child's session directory is cleaned up */
orte_session_dir_finalize(child->name);
/* check for everything complete */
/* check for everything complete - this will remove
* the child object from our local list
*/
check_proc_complete(child);
}
}

Просмотреть файл

@ -127,7 +127,7 @@ orte_odls_base_default_signal_local_procs(const orte_process_name_t *proc, int32
typedef int (*orte_odls_base_kill_local_fn_t)(pid_t pid, int signum);
/* define a function type to detect that a child died */
typedef bool (*orte_odls_base_child_died_fn_t)(pid_t pid, unsigned int timeout, int *exit_status);
typedef bool (*orte_odls_base_child_died_fn_t)(orte_odls_child_t *child);
ORTE_DECLSPEC int
orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,

Просмотреть файл

@ -125,7 +125,7 @@ orte_odls_base_module_t orte_odls_default_module = {
} \
} while(0);
static bool odls_default_child_died(pid_t pid, unsigned int timeout, int *exit_status)
static bool odls_default_child_died(orte_odls_child_t *child)
{
time_t end;
pid_t ret;
@ -134,13 +134,13 @@ static bool odls_default_child_died(pid_t pid, unsigned int timeout, int *exit_s
fd_set bogus;
#endif
end = time(NULL) + timeout;
end = time(NULL) + orte_odls_globals.timeout_before_sigkill;
do {
ret = waitpid(pid, exit_status, WNOHANG);
if (pid == ret) {
ret = waitpid(child->pid, &child->exit_code, WNOHANG);
if (child->pid == ret) {
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
"%s odls:default:WAITPID INDICATES PROC %d IS DEAD",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)pid));
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid)));
/* It died -- return success */
return true;
} else if (0 == ret) {
@ -154,14 +154,14 @@ static bool odls_default_child_died(pid_t pid, unsigned int timeout, int *exit_s
*/
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
"%s odls:default:WAITPID INDICATES PROC %d HAS ALREADY EXITED",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)pid));
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid)));
return true;
} else if (-1 == ret && ECHILD == errno) {
/* The pid no longer exists, so we'll call this "good
enough for government work" */
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
"%s odls:default:WAITPID INDICATES PID %d NO LONGER EXISTS",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)pid));
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid)));
return true;
}

Просмотреть файл

@ -50,12 +50,11 @@
static void set_handler_default(int sig);
static bool odls_process_child_died( pid_t pid, unsigned int timeout,
int* exit_status )
static bool odls_process_child_died( orte_odls_child_t *child )
{
int error;
HANDLE handle = OpenProcess( PROCESS_TERMINATE | SYNCHRONIZE, FALSE,
(DWORD)pid );
(DWORD)(child->pid) );
if( INVALID_HANDLE_VALUE == handle ) {
error = GetLastError();
/* Let's suppose that the process has disappeared ... by now */