1
1

Add some diagnostics to chase down forced termination of procs. Ensure that procs are removed from the local data list upon termination.

This commit was SVN r22223.
Этот коммит содержится в:
Ralph Castain 2009-11-19 19:43:10 +00:00
родитель 3921069230
Коммит a401f05ea3
2 изменённых файлов: 46 добавлений и 19 удалений

Просмотреть файл

@ -2338,7 +2338,7 @@ static bool any_live_children(orte_jobid_t job)
child = (orte_odls_child_t*)item;
/* is this child part of the specified job? */
if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID) &&
if ((job == child->name->jobid || ORTE_JOBID_WILDCARD == job) &&
child->alive) {
return true;
}
@ -2382,6 +2382,25 @@ static void check_proc_complete(orte_odls_child_t *child)
/* setup the alert buffer */
OBJ_CONSTRUCT(&alert, opal_buffer_t);
/* find the jobdat */
jdat = NULL;
for (item = opal_list_get_first(&orte_local_jobdata);
item != opal_list_get_end(&orte_local_jobdata);
item = opal_list_get_next(item)) {
jdat = (orte_odls_job_t*)item;
/* is this the specified job? */
if (jdat->jobid == child->name->jobid) {
break;
}
}
if (NULL == jdat) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
goto unlock;
}
/* decrement the num_local_procs as this one is complete */
jdat->num_local_procs--;
/* if the proc aborted, tell the HNP right away */
if (ORTE_PROC_STATE_TERMINATED != child->state) {
/* pack update state command */
@ -2402,11 +2421,17 @@ static void check_proc_complete(orte_odls_child_t *child)
goto unlock;
}
/* remove the child from our local list as it is no longer alive */
opal_list_remove_item(&orte_local_children, &child->super);
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:proc_complete reporting proc %s aborted to HNP",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name)));
/* release the child object */
OBJ_RELEASE(child);
/* if we are the HNP, then we would rather not send this to ourselves -
* instead, we queue it up for local processing
*/
@ -2431,21 +2456,6 @@ static void check_proc_complete(orte_odls_child_t *child)
goto unlock;
}
/* pack the data for the job */
jdat = NULL;
for (item = opal_list_get_first(&orte_local_jobdata);
item != opal_list_get_end(&orte_local_jobdata);
item = opal_list_get_next(item)) {
jdat = (orte_odls_job_t*)item;
/* is this the specified job? */
if (jdat->jobid == child->name->jobid) {
break;
}
}
if (NULL == jdat) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
goto unlock;
}
if (ORTE_SUCCESS != (rc = pack_state_update(&alert, false, jdat))) {
ORTE_ERROR_LOG(rc);
goto unlock;
@ -2804,7 +2814,7 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, bool se
int i;
opal_pointer_array_t procarray, *procptr;
bool do_cleanup;
OBJ_CONSTRUCT(&procs_killed, opal_list_t);
/* since we are going to be working with the global list of
@ -2898,7 +2908,7 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, bool se
}
/* remove the child from the list since it is either already dead or soon going to be dead */
opal_list_remove_item(&orte_local_children, item);
opal_list_remove_item(&orte_local_children, &child->super);
/* store the jobid, if required */
if (last_job != child->name->jobid) {

Просмотреть файл

@ -136,6 +136,9 @@ static bool odls_default_child_died(pid_t pid, unsigned int timeout, int *exit_s
do {
ret = waitpid(pid, exit_status, WNOHANG);
if (pid == ret) {
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
"%s odls:default:WAITPID INDICATES PROC %d IS DEAD",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)pid));
/* It died -- return success */
return true;
} else if (0 == ret) {
@ -147,10 +150,16 @@ static bool odls_default_child_died(pid_t pid, unsigned int timeout, int *exit_s
* by waitpid in this case, so we cannot check it - just assume
* the proc has indeed died
*/
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
"%s odls:default:WAITPID INDICATES PROC %d HAS ALREADY EXITED",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)pid));
return true;
} else if (-1 == ret && ECHILD == errno) {
/* The pid no longer exists, so we'll call this "good
enough for government work" */
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
"%s odls:default:WAITPID INDICATES PID %d NO LONGER EXISTS",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)pid));
return true;
}
@ -174,8 +183,16 @@ static bool odls_default_child_died(pid_t pid, unsigned int timeout, int *exit_s
/*
 * Send signal `signum` to the process identified by `pid`.
 *
 * Returns 0 when kill() succeeds, and also when it fails with ESRCH
 * (the target process no longer exists, which is treated as success).
 * For any other failure, returns the errno value reported by kill().
 */
static int odls_default_kill_local(pid_t pid, int signum)
{
    if (0 != kill(pid, signum)) {
        /* Capture errno immediately: the diagnostic output below may make
         * library calls that overwrite it before we log or return it. */
        int kill_errno = errno;

        if (ESRCH != kill_errno) {
            OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
                                 "%s odls:default:SENT KILL %d TO PID %d GOT ERRNO %d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), signum, (int)pid, kill_errno));
            return kill_errno;
        }
        /* ESRCH: nothing left to signal - fall through and report success */
    }
    OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
                         "%s odls:default:SENT KILL %d TO PID %d SUCCESS",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), signum, (int)pid));
    return 0;
}