1
1

Print a nice error message when a daemon fails, and exit with a non-zero status

This commit was SVN r23314.
Этот коммит содержится в:
Ralph Castain 2010-06-28 16:38:54 +00:00
родитель 1fad51776d
Коммит 3237b9ec87
2 изменённых файла: 229 добавлений и 203 удаления

Просмотреть файл

@ -124,6 +124,7 @@ static int update_state(orte_jobid_t job,
orte_odls_child_t *child; orte_odls_child_t *child;
int rc; int rc;
orte_app_context_t *app; orte_app_context_t *app;
orte_proc_t *pdat;
/* indicate that this is the end of the line */ /* indicate that this is the end of the line */
*stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE; *stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE;
@ -174,114 +175,114 @@ static int update_state(orte_jobid_t job,
orte_job_state_to_str(jobstate))); orte_job_state_to_str(jobstate)));
switch (jobstate) { switch (jobstate) {
case ORTE_JOB_STATE_FAILED_TO_START: case ORTE_JOB_STATE_FAILED_TO_START:
failed_start(jdata); failed_start(jdata);
check_job_complete(jdata); /* set the local proc states */ check_job_complete(jdata); /* set the local proc states */
/* the job object for this job will have been NULL'd /* the job object for this job will have been NULL'd
* in the array if the job was solely local. If it isn't * in the array if the job was solely local. If it isn't
* NULL, then we need to tell everyone else to die * NULL, then we need to tell everyone else to die
*/ */
if (NULL != (jdata = orte_get_job_data_object(job))) { if (NULL != (jdata = orte_get_job_data_object(job))) {
sts = exit_code; sts = exit_code;
if (ORTE_PROC_MY_NAME->jobid == job && !orte_abnormal_term_ordered) { if (ORTE_PROC_MY_NAME->jobid == job && !orte_abnormal_term_ordered) {
/* set the flag indicating that a daemon failed so we use the proper /* set the flag indicating that a daemon failed so we use the proper
* methods for attempting to shutdown the rest of the system * methods for attempting to shutdown the rest of the system
*/ */
orte_abnormal_term_ordered = true; orte_abnormal_term_ordered = true;
if (WIFSIGNALED(exit_code)) { /* died on signal */ if (WIFSIGNALED(exit_code)) { /* died on signal */
#ifdef WCOREDUMP #ifdef WCOREDUMP
if (WCOREDUMP(exit_code)) { if (WCOREDUMP(exit_code)) {
orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true, orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true,
WTERMSIG(exit_code)); WTERMSIG(exit_code));
sts = WTERMSIG(exit_code); sts = WTERMSIG(exit_code);
} else { } else {
orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
WTERMSIG(exit_code));
sts = WTERMSIG(exit_code);
}
#else
orte_show_help("help-plm-base.txt", "daemon-died-signal", true, orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
WTERMSIG(exit_code)); WTERMSIG(exit_code));
sts = WTERMSIG(exit_code); sts = WTERMSIG(exit_code);
#endif /* WCOREDUMP */
} else {
orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true,
WEXITSTATUS(exit_code));
sts = WEXITSTATUS(exit_code);
} }
} #else
hnp_abort(jdata->jobid, sts); orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
} WTERMSIG(exit_code));
break; sts = WTERMSIG(exit_code);
case ORTE_JOB_STATE_RUNNING: #endif /* WCOREDUMP */
/* update all procs in job */ } else {
update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_RUNNING, 0); orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true,
/* record that we reported */ WEXITSTATUS(exit_code));
jdata->num_daemons_reported++; sts = WEXITSTATUS(exit_code);
/* report if requested */
if (orte_report_launch_progress) {
if (0 == jdata->num_daemons_reported % 100 || jdata->num_daemons_reported == orte_process_info.num_procs) {
opal_output(orte_clean_output, "Reported: %d (out of %d) daemons - %d (out of %d) procs",
(int)jdata->num_daemons_reported, (int)orte_process_info.num_procs,
(int)jdata->num_launched, (int)jdata->num_procs);
} }
} }
break; hnp_abort(jdata->jobid, sts);
case ORTE_JOB_STATE_NEVER_LAUNCHED: }
orte_never_launched = true; break;
jdata->num_terminated = jdata->num_procs; case ORTE_JOB_STATE_RUNNING:
check_job_complete(jdata); /* set the local proc states */ /* update all procs in job */
/* the job object for this job will have been NULL'd update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_RUNNING, 0);
* in the array if the job was solely local. If it isn't /* record that we reported */
* NULL, then we need to tell everyone else to die jdata->num_daemons_reported++;
*/ /* report if requested */
if (NULL != (jdata = orte_get_job_data_object(job))) { if (orte_report_launch_progress) {
hnp_abort(jdata->jobid, exit_code); if (0 == jdata->num_daemons_reported % 100 || jdata->num_daemons_reported == orte_process_info.num_procs) {
opal_output(orte_clean_output, "Reported: %d (out of %d) daemons - %d (out of %d) procs",
(int)jdata->num_daemons_reported, (int)orte_process_info.num_procs,
(int)jdata->num_launched, (int)jdata->num_procs);
} }
break; }
case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED: break;
/* update all procs in job */ case ORTE_JOB_STATE_NEVER_LAUNCHED:
update_local_procs_in_job(jdata, jobstate, orte_never_launched = true;
ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED, jdata->num_terminated = jdata->num_procs;
exit_code); check_job_complete(jdata); /* set the local proc states */
/* order all local procs for this job to be killed */ /* the job object for this job will have been NULL'd
killprocs(jdata->jobid, ORTE_VPID_WILDCARD); * in the array if the job was solely local. If it isn't
check_job_complete(jdata); /* set the local proc states */ * NULL, then we need to tell everyone else to die
/* the job object for this job will have been NULL'd */
* in the array if the job was solely local. If it isn't if (NULL != (jdata = orte_get_job_data_object(job))) {
* NULL, then we need to tell everyone else to die hnp_abort(jdata->jobid, exit_code);
*/ }
if (NULL != (jdata = orte_get_job_data_object(job))) { break;
hnp_abort(jdata->jobid, exit_code); case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED:
} /* update all procs in job */
break; update_local_procs_in_job(jdata, jobstate,
case ORTE_JOB_STATE_COMM_FAILED: ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED,
/* order all local procs for this job to be killed */ exit_code);
killprocs(jdata->jobid, ORTE_VPID_WILDCARD); /* order all local procs for this job to be killed */
check_job_complete(jdata); /* set the local proc states */ killprocs(jdata->jobid, ORTE_VPID_WILDCARD);
/* the job object for this job will have been NULL'd check_job_complete(jdata); /* set the local proc states */
* in the array if the job was solely local. If it isn't /* the job object for this job will have been NULL'd
* NULL, then we need to tell everyone else to die * in the array if the job was solely local. If it isn't
*/ * NULL, then we need to tell everyone else to die
if (NULL != (jdata = orte_get_job_data_object(job))) { */
hnp_abort(jdata->jobid, exit_code); if (NULL != (jdata = orte_get_job_data_object(job))) {
} hnp_abort(jdata->jobid, exit_code);
break; }
case ORTE_JOB_STATE_HEARTBEAT_FAILED: break;
/* order all local procs for this job to be killed */ case ORTE_JOB_STATE_COMM_FAILED:
killprocs(jdata->jobid, ORTE_VPID_WILDCARD); /* order all local procs for this job to be killed */
check_job_complete(jdata); /* set the local proc states */ killprocs(jdata->jobid, ORTE_VPID_WILDCARD);
/* the job object for this job will have been NULL'd check_job_complete(jdata); /* set the local proc states */
* in the array if the job was solely local. If it isn't /* the job object for this job will have been NULL'd
* NULL, then we need to tell everyone else to die * in the array if the job was solely local. If it isn't
*/ * NULL, then we need to tell everyone else to die
if (NULL != (jdata = orte_get_job_data_object(job))) { */
hnp_abort(jdata->jobid, exit_code); if (NULL != (jdata = orte_get_job_data_object(job))) {
} hnp_abort(jdata->jobid, exit_code);
break; }
break;
case ORTE_JOB_STATE_HEARTBEAT_FAILED:
/* order all local procs for this job to be killed */
killprocs(jdata->jobid, ORTE_VPID_WILDCARD);
check_job_complete(jdata); /* set the local proc states */
/* the job object for this job will have been NULL'd
* in the array if the job was solely local. If it isn't
* NULL, then we need to tell everyone else to die
*/
if (NULL != (jdata = orte_get_job_data_object(job))) {
hnp_abort(jdata->jobid, exit_code);
}
break;
default: default:
break; break;
} }
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
@ -294,135 +295,146 @@ static int update_state(orte_jobid_t job,
/* update is for a specific proc */ /* update is for a specific proc */
switch (state) { switch (state) {
case ORTE_PROC_STATE_ABORTED: case ORTE_PROC_STATE_ABORTED:
case ORTE_PROC_STATE_ABORTED_BY_SIG: case ORTE_PROC_STATE_ABORTED_BY_SIG:
case ORTE_PROC_STATE_TERM_WO_SYNC: case ORTE_PROC_STATE_TERM_WO_SYNC:
if (jdata->enable_recovery) { if (jdata->enable_recovery) {
/* is this a local proc */ /* is this a local proc */
if (NULL != (child = proc_is_local(proc))) { if (NULL != (child = proc_is_local(proc))) {
/* local proc - see if it has reached its local restart limit */ /* local proc - see if it has reached its local restart limit */
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, child->app_idx); app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, child->app_idx);
if (child->restarts < app->max_local_restarts) { if (child->restarts < app->max_local_restarts) {
child->restarts++; child->restarts++;
if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) { if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) {
return ORTE_SUCCESS; return ORTE_SUCCESS;
}
/* let it fall thru to abort */
} else {
/* see if we can relocate it somewhere else */
if (ORTE_SUCCESS == hnp_relocate(jdata, proc)) {
return ORTE_SUCCESS;
}
/* let it fall thru to abort */
} }
/* let it fall thru to abort */
} else { } else {
/* this is a remote process - see if we can relocate it */ /* see if we can relocate it somewhere else */
if (ORTE_SUCCESS == hnp_relocate(jdata, proc)) { if (ORTE_SUCCESS == hnp_relocate(jdata, proc)) {
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
/* guess not - let it fall thru to abort */ /* let it fall thru to abort */
} }
} else {
/* this is a remote process - see if we can relocate it */
if (ORTE_SUCCESS == hnp_relocate(jdata, proc)) {
return ORTE_SUCCESS;
}
/* guess not - let it fall thru to abort */
} }
update_proc(jdata, proc, state, pid, exit_code); }
check_job_complete(jdata); /* need to set the job state */ update_proc(jdata, proc, state, pid, exit_code);
/* the job object for this job will have been NULL'd check_job_complete(jdata); /* need to set the job state */
* in the array if the job was solely local. If it isn't /* the job object for this job will have been NULL'd
* NULL, then we need to tell everyone else to die * in the array if the job was solely local. If it isn't
*/ * NULL, then we need to tell everyone else to die
if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { */
hnp_abort(jdata->jobid, exit_code); if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) {
} hnp_abort(jdata->jobid, exit_code);
break; }
break;
case ORTE_PROC_STATE_FAILED_TO_START: case ORTE_PROC_STATE_FAILED_TO_START:
case ORTE_PROC_STATE_CALLED_ABORT: case ORTE_PROC_STATE_CALLED_ABORT:
update_proc(jdata, proc, state, pid, exit_code); update_proc(jdata, proc, state, pid, exit_code);
check_job_complete(jdata); check_job_complete(jdata);
/* the job object for this job will have been NULL'd /* the job object for this job will have been NULL'd
* in the array if the job was solely local. If it isn't * in the array if the job was solely local. If it isn't
* NULL, then we need to tell everyone else to die * NULL, then we need to tell everyone else to die
*/ */
if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) {
hnp_abort(jdata->jobid, exit_code); hnp_abort(jdata->jobid, exit_code);
} }
break; break;
case ORTE_PROC_STATE_REGISTERED: case ORTE_PROC_STATE_REGISTERED:
case ORTE_PROC_STATE_RUNNING: case ORTE_PROC_STATE_RUNNING:
update_proc(jdata, proc, state, pid, exit_code); update_proc(jdata, proc, state, pid, exit_code);
break; break;
case ORTE_PROC_STATE_LAUNCHED: case ORTE_PROC_STATE_LAUNCHED:
/* record the pid for this child */ /* record the pid for this child */
update_proc(jdata, proc, state, pid, exit_code); update_proc(jdata, proc, state, pid, exit_code);
break; break;
case ORTE_PROC_STATE_TERMINATED: case ORTE_PROC_STATE_TERMINATED:
case ORTE_PROC_STATE_KILLED_BY_CMD: case ORTE_PROC_STATE_KILLED_BY_CMD:
update_proc(jdata, proc, state, pid, exit_code); update_proc(jdata, proc, state, pid, exit_code);
check_job_complete(jdata); check_job_complete(jdata);
break; break;
case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED:
update_proc(jdata, proc, state, pid, exit_code); update_proc(jdata, proc, state, pid, exit_code);
killprocs(proc->jobid, proc->vpid); killprocs(proc->jobid, proc->vpid);
check_job_complete(jdata); /* need to set the job state */ check_job_complete(jdata); /* need to set the job state */
/* the job object for this job will have been NULL'd /* the job object for this job will have been NULL'd
* in the array if the job was solely local. If it isn't * in the array if the job was solely local. If it isn't
* NULL, then we need to tell everyone else to die * NULL, then we need to tell everyone else to die
*/ */
if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) {
hnp_abort(jdata->jobid, exit_code); hnp_abort(jdata->jobid, exit_code);
} }
break; break;
case ORTE_PROC_STATE_COMM_FAILED: case ORTE_PROC_STATE_COMM_FAILED:
/* is this to a daemon? */ /* is this to a daemon? */
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
/* if we have ordered orteds to terminate, ignore this */ /* if we have ordered orteds to terminate, ignore this */
if (orte_orteds_term_ordered) { if (orte_orteds_term_ordered) {
break; break;
} }
/* if this is my own connection, ignore it */ /* if this is my own connection, ignore it */
if (ORTE_PROC_MY_NAME->vpid == proc->vpid) { if (ORTE_PROC_MY_NAME->vpid == proc->vpid) {
break; break;
} }
if (orte_enable_recovery) { if (orte_enable_recovery) {
/* relocate its processes */ /* relocate its processes */
if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc))) { if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc))) {
/* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* kill all jobs */
hnp_abort(ORTE_JOBID_WILDCARD, exit_code);
}
} else {
update_proc(jdata, proc, state, pid, ORTE_ERR_COMM_FAILURE);
/* kill all local procs */ /* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* kill all jobs */ /* kill all jobs */
hnp_abort(ORTE_JOBID_WILDCARD, exit_code); hnp_abort(ORTE_JOBID_WILDCARD, exit_code);
} }
} else { } else {
/* delete the route */ if (NULL == (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
orte_routed.delete_route(proc); ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
} orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died",
break; ORTE_VPID_PRINT(proc->vpid), "Unknown");
} else {
case ORTE_PROC_STATE_HEARTBEAT_FAILED: orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died",
/* heartbeats are only from daemons */ ORTE_VPID_PRINT(proc->vpid),
if (orte_enable_recovery) { (NULL == pdat->node) ? "Unknown" :
/* relocate its processes */ ((NULL == pdat->node->name) ? "Unknown" : pdat->node->name));
} else { }
ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE);
update_proc(jdata, proc, state, pid, ORTE_ERR_COMM_FAILURE);
/* kill all local procs */ /* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* kill all jobs */ /* kill all jobs */
hnp_abort(ORTE_JOBID_WILDCARD, exit_code); hnp_abort(ORTE_JOBID_WILDCARD, exit_code);
return ORTE_ERR_UNRECOVERABLE;
} }
break; } else {
/* delete the route */
orte_routed.delete_route(proc);
}
break;
default: case ORTE_PROC_STATE_HEARTBEAT_FAILED:
break; /* heartbeats are only from daemons */
if (orte_enable_recovery) {
/* relocate its processes */
} else {
/* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* kill all jobs */
hnp_abort(ORTE_JOBID_WILDCARD, exit_code);
return ORTE_ERR_UNRECOVERABLE;
}
break;
default:
break;
} }
return ORTE_SUCCESS; return ORTE_SUCCESS;

Просмотреть файл

@ -17,3 +17,17 @@ except due to an internal ORTE error.
Job state: %s Job state: %s
This information should probably be reported to the OMPI developers. This information should probably be reported to the OMPI developers.
#
[errmgr-hnp:daemon-died]
The system has lost communication with the following daemon:
Daemon: %s
Node: %s
The reason for the lost communication channel is unknown. Possible
reasons include failure of the daemon itself, failure of the
connecting fabric/switch, and loss of the host node. Please
check with your system administrator to try to determine the
source of the problem.
Your job is being terminated as a result.