Add some diagnostics to chase down forced termination of procs. Ensure that procs are removed from the local data list upon termination.
This commit was SVN r22223.
Этот коммит содержится в:
родитель
3921069230
Коммит
a401f05ea3
@@ -2338,7 +2338,7 @@ static bool any_live_children(orte_jobid_t job)
|
||||
child = (orte_odls_child_t*)item;
|
||||
|
||||
/* is this child part of the specified job? */
|
||||
if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID) &&
|
||||
if ((job == child->name->jobid || ORTE_JOBID_WILDCARD == job) &&
|
||||
child->alive) {
|
||||
return true;
|
||||
}
|
||||
@@ -2382,6 +2382,25 @@ static void check_proc_complete(orte_odls_child_t *child)
|
||||
/* setup the alert buffer */
|
||||
OBJ_CONSTRUCT(&alert, opal_buffer_t);
|
||||
|
||||
/* find the jobdat */
|
||||
jdat = NULL;
|
||||
for (item = opal_list_get_first(&orte_local_jobdata);
|
||||
item != opal_list_get_end(&orte_local_jobdata);
|
||||
item = opal_list_get_next(item)) {
|
||||
jdat = (orte_odls_job_t*)item;
|
||||
|
||||
/* is this the specified job? */
|
||||
if (jdat->jobid == child->name->jobid) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (NULL == jdat) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
goto unlock;
|
||||
}
|
||||
/* decrement the num_local_procs as this one is complete */
|
||||
jdat->num_local_procs--;
|
||||
|
||||
/* if the proc aborted, tell the HNP right away */
|
||||
if (ORTE_PROC_STATE_TERMINATED != child->state) {
|
||||
/* pack update state command */
|
||||
@@ -2402,11 +2421,17 @@ static void check_proc_complete(orte_odls_child_t *child)
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
/* remove the child from our local list as it is no longer alive */
|
||||
opal_list_remove_item(&orte_local_children, &child->super);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:proc_complete reporting proc %s aborted to HNP",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(child->name)));
|
||||
|
||||
/* release the child object */
|
||||
OBJ_RELEASE(child);
|
||||
|
||||
/* if we are the HNP, then we would rather not send this to ourselves -
|
||||
* instead, we queue it up for local processing
|
||||
*/
|
||||
@@ -2431,21 +2456,6 @@ static void check_proc_complete(orte_odls_child_t *child)
|
||||
goto unlock;
|
||||
}
|
||||
/* pack the data for the job */
|
||||
jdat = NULL;
|
||||
for (item = opal_list_get_first(&orte_local_jobdata);
|
||||
item != opal_list_get_end(&orte_local_jobdata);
|
||||
item = opal_list_get_next(item)) {
|
||||
jdat = (orte_odls_job_t*)item;
|
||||
|
||||
/* is this the specified job? */
|
||||
if (jdat->jobid == child->name->jobid) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (NULL == jdat) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
goto unlock;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = pack_state_update(&alert, false, jdat))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto unlock;
|
||||
@@ -2804,7 +2814,7 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, bool se
|
||||
int i;
|
||||
opal_pointer_array_t procarray, *procptr;
|
||||
bool do_cleanup;
|
||||
|
||||
|
||||
OBJ_CONSTRUCT(&procs_killed, opal_list_t);
|
||||
|
||||
/* since we are going to be working with the global list of
|
||||
@@ -2898,7 +2908,7 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, bool se
|
||||
}
|
||||
|
||||
/* remove the child from the list since it is either already dead or soon going to be dead */
|
||||
opal_list_remove_item(&orte_local_children, item);
|
||||
opal_list_remove_item(&orte_local_children, &child->super);
|
||||
|
||||
/* store the jobid, if required */
|
||||
if (last_job != child->name->jobid) {
|
||||
|
@@ -136,6 +136,9 @@ static bool odls_default_child_died(pid_t pid, unsigned int timeout, int *exit_s
|
||||
do {
|
||||
ret = waitpid(pid, exit_status, WNOHANG);
|
||||
if (pid == ret) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
|
||||
"%s odls:default:WAITPID INDICATES PROC %d IS DEAD",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)pid));
|
||||
/* It died -- return success */
|
||||
return true;
|
||||
} else if (0 == ret) {
|
||||
@@ -147,10 +150,16 @@ static bool odls_default_child_died(pid_t pid, unsigned int timeout, int *exit_s
|
||||
* by waitpid in this case, so we cannot check it - just assume
|
||||
* the proc has indeed died
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
|
||||
"%s odls:default:WAITPID INDICATES PROC %d HAS ALREADY EXITED",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)pid));
|
||||
return true;
|
||||
} else if (-1 == ret && ECHILD == errno) {
|
||||
/* The pid no longer exists, so we'll call this "good
|
||||
enough for government work" */
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
|
||||
"%s odls:default:WAITPID INDICATES PID %d NO LONGER EXISTS",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)pid));
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -174,8 +183,16 @@ static bool odls_default_child_died(pid_t pid, unsigned int timeout, int *exit_s
|
||||
static int odls_default_kill_local(pid_t pid, int signum)
|
||||
{
|
||||
if (0 != kill(pid, signum)) {
|
||||
if (ESRCH != errno) return errno;
|
||||
if (ESRCH != errno) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
|
||||
"%s odls:default:SENT KILL %d TO PID %d GOT ERRNO %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), signum, (int)pid, errno));
|
||||
return errno;
|
||||
}
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
|
||||
"%s odls:default:SENT KILL %d TO PID %d SUCCESS",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), signum, (int)pid));
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user