Print a nice error message when a daemon fails, and exit with a non-zero status
This commit was SVN r23314.
Этот коммит содержится в:
родитель
1fad51776d
Коммит
3237b9ec87
@ -124,6 +124,7 @@ static int update_state(orte_jobid_t job,
|
|||||||
orte_odls_child_t *child;
|
orte_odls_child_t *child;
|
||||||
int rc;
|
int rc;
|
||||||
orte_app_context_t *app;
|
orte_app_context_t *app;
|
||||||
|
orte_proc_t *pdat;
|
||||||
|
|
||||||
/* indicate that this is the end of the line */
|
/* indicate that this is the end of the line */
|
||||||
*stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE;
|
*stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE;
|
||||||
@ -174,114 +175,114 @@ static int update_state(orte_jobid_t job,
|
|||||||
orte_job_state_to_str(jobstate)));
|
orte_job_state_to_str(jobstate)));
|
||||||
|
|
||||||
switch (jobstate) {
|
switch (jobstate) {
|
||||||
case ORTE_JOB_STATE_FAILED_TO_START:
|
case ORTE_JOB_STATE_FAILED_TO_START:
|
||||||
failed_start(jdata);
|
failed_start(jdata);
|
||||||
check_job_complete(jdata); /* set the local proc states */
|
check_job_complete(jdata); /* set the local proc states */
|
||||||
/* the job object for this job will have been NULL'd
|
/* the job object for this job will have been NULL'd
|
||||||
* in the array if the job was solely local. If it isn't
|
* in the array if the job was solely local. If it isn't
|
||||||
* NULL, then we need to tell everyone else to die
|
* NULL, then we need to tell everyone else to die
|
||||||
*/
|
*/
|
||||||
if (NULL != (jdata = orte_get_job_data_object(job))) {
|
if (NULL != (jdata = orte_get_job_data_object(job))) {
|
||||||
sts = exit_code;
|
sts = exit_code;
|
||||||
if (ORTE_PROC_MY_NAME->jobid == job && !orte_abnormal_term_ordered) {
|
if (ORTE_PROC_MY_NAME->jobid == job && !orte_abnormal_term_ordered) {
|
||||||
/* set the flag indicating that a daemon failed so we use the proper
|
/* set the flag indicating that a daemon failed so we use the proper
|
||||||
* methods for attempting to shutdown the rest of the system
|
* methods for attempting to shutdown the rest of the system
|
||||||
*/
|
*/
|
||||||
orte_abnormal_term_ordered = true;
|
orte_abnormal_term_ordered = true;
|
||||||
if (WIFSIGNALED(exit_code)) { /* died on signal */
|
if (WIFSIGNALED(exit_code)) { /* died on signal */
|
||||||
#ifdef WCOREDUMP
|
#ifdef WCOREDUMP
|
||||||
if (WCOREDUMP(exit_code)) {
|
if (WCOREDUMP(exit_code)) {
|
||||||
orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true,
|
orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true,
|
||||||
WTERMSIG(exit_code));
|
WTERMSIG(exit_code));
|
||||||
sts = WTERMSIG(exit_code);
|
sts = WTERMSIG(exit_code);
|
||||||
} else {
|
} else {
|
||||||
orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
|
|
||||||
WTERMSIG(exit_code));
|
|
||||||
sts = WTERMSIG(exit_code);
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
|
orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
|
||||||
WTERMSIG(exit_code));
|
WTERMSIG(exit_code));
|
||||||
sts = WTERMSIG(exit_code);
|
sts = WTERMSIG(exit_code);
|
||||||
#endif /* WCOREDUMP */
|
|
||||||
} else {
|
|
||||||
orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true,
|
|
||||||
WEXITSTATUS(exit_code));
|
|
||||||
sts = WEXITSTATUS(exit_code);
|
|
||||||
}
|
}
|
||||||
}
|
#else
|
||||||
hnp_abort(jdata->jobid, sts);
|
orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
|
||||||
}
|
WTERMSIG(exit_code));
|
||||||
break;
|
sts = WTERMSIG(exit_code);
|
||||||
case ORTE_JOB_STATE_RUNNING:
|
#endif /* WCOREDUMP */
|
||||||
/* update all procs in job */
|
} else {
|
||||||
update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_RUNNING, 0);
|
orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true,
|
||||||
/* record that we reported */
|
WEXITSTATUS(exit_code));
|
||||||
jdata->num_daemons_reported++;
|
sts = WEXITSTATUS(exit_code);
|
||||||
/* report if requested */
|
|
||||||
if (orte_report_launch_progress) {
|
|
||||||
if (0 == jdata->num_daemons_reported % 100 || jdata->num_daemons_reported == orte_process_info.num_procs) {
|
|
||||||
opal_output(orte_clean_output, "Reported: %d (out of %d) daemons - %d (out of %d) procs",
|
|
||||||
(int)jdata->num_daemons_reported, (int)orte_process_info.num_procs,
|
|
||||||
(int)jdata->num_launched, (int)jdata->num_procs);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
hnp_abort(jdata->jobid, sts);
|
||||||
case ORTE_JOB_STATE_NEVER_LAUNCHED:
|
}
|
||||||
orte_never_launched = true;
|
break;
|
||||||
jdata->num_terminated = jdata->num_procs;
|
case ORTE_JOB_STATE_RUNNING:
|
||||||
check_job_complete(jdata); /* set the local proc states */
|
/* update all procs in job */
|
||||||
/* the job object for this job will have been NULL'd
|
update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_RUNNING, 0);
|
||||||
* in the array if the job was solely local. If it isn't
|
/* record that we reported */
|
||||||
* NULL, then we need to tell everyone else to die
|
jdata->num_daemons_reported++;
|
||||||
*/
|
/* report if requested */
|
||||||
if (NULL != (jdata = orte_get_job_data_object(job))) {
|
if (orte_report_launch_progress) {
|
||||||
hnp_abort(jdata->jobid, exit_code);
|
if (0 == jdata->num_daemons_reported % 100 || jdata->num_daemons_reported == orte_process_info.num_procs) {
|
||||||
|
opal_output(orte_clean_output, "Reported: %d (out of %d) daemons - %d (out of %d) procs",
|
||||||
|
(int)jdata->num_daemons_reported, (int)orte_process_info.num_procs,
|
||||||
|
(int)jdata->num_launched, (int)jdata->num_procs);
|
||||||
}
|
}
|
||||||
break;
|
}
|
||||||
case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED:
|
break;
|
||||||
/* update all procs in job */
|
case ORTE_JOB_STATE_NEVER_LAUNCHED:
|
||||||
update_local_procs_in_job(jdata, jobstate,
|
orte_never_launched = true;
|
||||||
ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED,
|
jdata->num_terminated = jdata->num_procs;
|
||||||
exit_code);
|
check_job_complete(jdata); /* set the local proc states */
|
||||||
/* order all local procs for this job to be killed */
|
/* the job object for this job will have been NULL'd
|
||||||
killprocs(jdata->jobid, ORTE_VPID_WILDCARD);
|
* in the array if the job was solely local. If it isn't
|
||||||
check_job_complete(jdata); /* set the local proc states */
|
* NULL, then we need to tell everyone else to die
|
||||||
/* the job object for this job will have been NULL'd
|
*/
|
||||||
* in the array if the job was solely local. If it isn't
|
if (NULL != (jdata = orte_get_job_data_object(job))) {
|
||||||
* NULL, then we need to tell everyone else to die
|
hnp_abort(jdata->jobid, exit_code);
|
||||||
*/
|
}
|
||||||
if (NULL != (jdata = orte_get_job_data_object(job))) {
|
break;
|
||||||
hnp_abort(jdata->jobid, exit_code);
|
case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED:
|
||||||
}
|
/* update all procs in job */
|
||||||
break;
|
update_local_procs_in_job(jdata, jobstate,
|
||||||
case ORTE_JOB_STATE_COMM_FAILED:
|
ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED,
|
||||||
/* order all local procs for this job to be killed */
|
exit_code);
|
||||||
killprocs(jdata->jobid, ORTE_VPID_WILDCARD);
|
/* order all local procs for this job to be killed */
|
||||||
check_job_complete(jdata); /* set the local proc states */
|
killprocs(jdata->jobid, ORTE_VPID_WILDCARD);
|
||||||
/* the job object for this job will have been NULL'd
|
check_job_complete(jdata); /* set the local proc states */
|
||||||
* in the array if the job was solely local. If it isn't
|
/* the job object for this job will have been NULL'd
|
||||||
* NULL, then we need to tell everyone else to die
|
* in the array if the job was solely local. If it isn't
|
||||||
*/
|
* NULL, then we need to tell everyone else to die
|
||||||
if (NULL != (jdata = orte_get_job_data_object(job))) {
|
*/
|
||||||
hnp_abort(jdata->jobid, exit_code);
|
if (NULL != (jdata = orte_get_job_data_object(job))) {
|
||||||
}
|
hnp_abort(jdata->jobid, exit_code);
|
||||||
break;
|
}
|
||||||
case ORTE_JOB_STATE_HEARTBEAT_FAILED:
|
break;
|
||||||
/* order all local procs for this job to be killed */
|
case ORTE_JOB_STATE_COMM_FAILED:
|
||||||
killprocs(jdata->jobid, ORTE_VPID_WILDCARD);
|
/* order all local procs for this job to be killed */
|
||||||
check_job_complete(jdata); /* set the local proc states */
|
killprocs(jdata->jobid, ORTE_VPID_WILDCARD);
|
||||||
/* the job object for this job will have been NULL'd
|
check_job_complete(jdata); /* set the local proc states */
|
||||||
* in the array if the job was solely local. If it isn't
|
/* the job object for this job will have been NULL'd
|
||||||
* NULL, then we need to tell everyone else to die
|
* in the array if the job was solely local. If it isn't
|
||||||
*/
|
* NULL, then we need to tell everyone else to die
|
||||||
if (NULL != (jdata = orte_get_job_data_object(job))) {
|
*/
|
||||||
hnp_abort(jdata->jobid, exit_code);
|
if (NULL != (jdata = orte_get_job_data_object(job))) {
|
||||||
}
|
hnp_abort(jdata->jobid, exit_code);
|
||||||
break;
|
}
|
||||||
|
break;
|
||||||
|
case ORTE_JOB_STATE_HEARTBEAT_FAILED:
|
||||||
|
/* order all local procs for this job to be killed */
|
||||||
|
killprocs(jdata->jobid, ORTE_VPID_WILDCARD);
|
||||||
|
check_job_complete(jdata); /* set the local proc states */
|
||||||
|
/* the job object for this job will have been NULL'd
|
||||||
|
* in the array if the job was solely local. If it isn't
|
||||||
|
* NULL, then we need to tell everyone else to die
|
||||||
|
*/
|
||||||
|
if (NULL != (jdata = orte_get_job_data_object(job))) {
|
||||||
|
hnp_abort(jdata->jobid, exit_code);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
@ -294,135 +295,146 @@ static int update_state(orte_jobid_t job,
|
|||||||
|
|
||||||
/* update is for a specific proc */
|
/* update is for a specific proc */
|
||||||
switch (state) {
|
switch (state) {
|
||||||
case ORTE_PROC_STATE_ABORTED:
|
case ORTE_PROC_STATE_ABORTED:
|
||||||
case ORTE_PROC_STATE_ABORTED_BY_SIG:
|
case ORTE_PROC_STATE_ABORTED_BY_SIG:
|
||||||
case ORTE_PROC_STATE_TERM_WO_SYNC:
|
case ORTE_PROC_STATE_TERM_WO_SYNC:
|
||||||
if (jdata->enable_recovery) {
|
if (jdata->enable_recovery) {
|
||||||
/* is this a local proc */
|
/* is this a local proc */
|
||||||
if (NULL != (child = proc_is_local(proc))) {
|
if (NULL != (child = proc_is_local(proc))) {
|
||||||
/* local proc - see if it has reached its local restart limit */
|
/* local proc - see if it has reached its local restart limit */
|
||||||
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, child->app_idx);
|
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, child->app_idx);
|
||||||
if (child->restarts < app->max_local_restarts) {
|
if (child->restarts < app->max_local_restarts) {
|
||||||
child->restarts++;
|
child->restarts++;
|
||||||
if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) {
|
if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) {
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
|
||||||
/* let it fall thru to abort */
|
|
||||||
} else {
|
|
||||||
/* see if we can relocate it somewhere else */
|
|
||||||
if (ORTE_SUCCESS == hnp_relocate(jdata, proc)) {
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
/* let it fall thru to abort */
|
|
||||||
}
|
}
|
||||||
|
/* let it fall thru to abort */
|
||||||
} else {
|
} else {
|
||||||
/* this is a remote process - see if we can relocate it */
|
/* see if we can relocate it somewhere else */
|
||||||
if (ORTE_SUCCESS == hnp_relocate(jdata, proc)) {
|
if (ORTE_SUCCESS == hnp_relocate(jdata, proc)) {
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
/* guess not - let it fall thru to abort */
|
/* let it fall thru to abort */
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
/* this is a remote process - see if we can relocate it */
|
||||||
|
if (ORTE_SUCCESS == hnp_relocate(jdata, proc)) {
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
/* guess not - let it fall thru to abort */
|
||||||
}
|
}
|
||||||
update_proc(jdata, proc, state, pid, exit_code);
|
}
|
||||||
check_job_complete(jdata); /* need to set the job state */
|
update_proc(jdata, proc, state, pid, exit_code);
|
||||||
/* the job object for this job will have been NULL'd
|
check_job_complete(jdata); /* need to set the job state */
|
||||||
* in the array if the job was solely local. If it isn't
|
/* the job object for this job will have been NULL'd
|
||||||
* NULL, then we need to tell everyone else to die
|
* in the array if the job was solely local. If it isn't
|
||||||
*/
|
* NULL, then we need to tell everyone else to die
|
||||||
if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) {
|
*/
|
||||||
hnp_abort(jdata->jobid, exit_code);
|
if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) {
|
||||||
}
|
hnp_abort(jdata->jobid, exit_code);
|
||||||
break;
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
case ORTE_PROC_STATE_FAILED_TO_START:
|
case ORTE_PROC_STATE_FAILED_TO_START:
|
||||||
case ORTE_PROC_STATE_CALLED_ABORT:
|
case ORTE_PROC_STATE_CALLED_ABORT:
|
||||||
update_proc(jdata, proc, state, pid, exit_code);
|
update_proc(jdata, proc, state, pid, exit_code);
|
||||||
check_job_complete(jdata);
|
check_job_complete(jdata);
|
||||||
/* the job object for this job will have been NULL'd
|
/* the job object for this job will have been NULL'd
|
||||||
* in the array if the job was solely local. If it isn't
|
* in the array if the job was solely local. If it isn't
|
||||||
* NULL, then we need to tell everyone else to die
|
* NULL, then we need to tell everyone else to die
|
||||||
*/
|
*/
|
||||||
if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) {
|
if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) {
|
||||||
hnp_abort(jdata->jobid, exit_code);
|
hnp_abort(jdata->jobid, exit_code);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case ORTE_PROC_STATE_REGISTERED:
|
case ORTE_PROC_STATE_REGISTERED:
|
||||||
case ORTE_PROC_STATE_RUNNING:
|
case ORTE_PROC_STATE_RUNNING:
|
||||||
update_proc(jdata, proc, state, pid, exit_code);
|
update_proc(jdata, proc, state, pid, exit_code);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case ORTE_PROC_STATE_LAUNCHED:
|
case ORTE_PROC_STATE_LAUNCHED:
|
||||||
/* record the pid for this child */
|
/* record the pid for this child */
|
||||||
update_proc(jdata, proc, state, pid, exit_code);
|
update_proc(jdata, proc, state, pid, exit_code);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case ORTE_PROC_STATE_TERMINATED:
|
case ORTE_PROC_STATE_TERMINATED:
|
||||||
case ORTE_PROC_STATE_KILLED_BY_CMD:
|
case ORTE_PROC_STATE_KILLED_BY_CMD:
|
||||||
update_proc(jdata, proc, state, pid, exit_code);
|
update_proc(jdata, proc, state, pid, exit_code);
|
||||||
check_job_complete(jdata);
|
check_job_complete(jdata);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED:
|
case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED:
|
||||||
update_proc(jdata, proc, state, pid, exit_code);
|
update_proc(jdata, proc, state, pid, exit_code);
|
||||||
killprocs(proc->jobid, proc->vpid);
|
killprocs(proc->jobid, proc->vpid);
|
||||||
check_job_complete(jdata); /* need to set the job state */
|
check_job_complete(jdata); /* need to set the job state */
|
||||||
/* the job object for this job will have been NULL'd
|
/* the job object for this job will have been NULL'd
|
||||||
* in the array if the job was solely local. If it isn't
|
* in the array if the job was solely local. If it isn't
|
||||||
* NULL, then we need to tell everyone else to die
|
* NULL, then we need to tell everyone else to die
|
||||||
*/
|
*/
|
||||||
if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) {
|
if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) {
|
||||||
hnp_abort(jdata->jobid, exit_code);
|
hnp_abort(jdata->jobid, exit_code);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case ORTE_PROC_STATE_COMM_FAILED:
|
case ORTE_PROC_STATE_COMM_FAILED:
|
||||||
/* is this to a daemon? */
|
/* is this to a daemon? */
|
||||||
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
|
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
|
||||||
/* if we have ordered orteds to terminate, ignore this */
|
/* if we have ordered orteds to terminate, ignore this */
|
||||||
if (orte_orteds_term_ordered) {
|
if (orte_orteds_term_ordered) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
/* if this is my own connection, ignore it */
|
/* if this is my own connection, ignore it */
|
||||||
if (ORTE_PROC_MY_NAME->vpid == proc->vpid) {
|
if (ORTE_PROC_MY_NAME->vpid == proc->vpid) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (orte_enable_recovery) {
|
if (orte_enable_recovery) {
|
||||||
/* relocate its processes */
|
/* relocate its processes */
|
||||||
if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc))) {
|
if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc))) {
|
||||||
/* kill all local procs */
|
|
||||||
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
|
|
||||||
/* kill all jobs */
|
|
||||||
hnp_abort(ORTE_JOBID_WILDCARD, exit_code);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
update_proc(jdata, proc, state, pid, ORTE_ERR_COMM_FAILURE);
|
|
||||||
/* kill all local procs */
|
/* kill all local procs */
|
||||||
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
|
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
|
||||||
/* kill all jobs */
|
/* kill all jobs */
|
||||||
hnp_abort(ORTE_JOBID_WILDCARD, exit_code);
|
hnp_abort(ORTE_JOBID_WILDCARD, exit_code);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
/* delete the route */
|
if (NULL == (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
|
||||||
orte_routed.delete_route(proc);
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||||
}
|
orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died",
|
||||||
break;
|
ORTE_VPID_PRINT(proc->vpid), "Unknown");
|
||||||
|
} else {
|
||||||
case ORTE_PROC_STATE_HEARTBEAT_FAILED:
|
orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died",
|
||||||
/* heartbeats are only from daemons */
|
ORTE_VPID_PRINT(proc->vpid),
|
||||||
if (orte_enable_recovery) {
|
(NULL == pdat->node) ? "Unknown" :
|
||||||
/* relocate its processes */
|
((NULL == pdat->node->name) ? "Unknown" : pdat->node->name));
|
||||||
} else {
|
}
|
||||||
|
ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE);
|
||||||
|
update_proc(jdata, proc, state, pid, ORTE_ERR_COMM_FAILURE);
|
||||||
/* kill all local procs */
|
/* kill all local procs */
|
||||||
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
|
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
|
||||||
/* kill all jobs */
|
/* kill all jobs */
|
||||||
hnp_abort(ORTE_JOBID_WILDCARD, exit_code);
|
hnp_abort(ORTE_JOBID_WILDCARD, exit_code);
|
||||||
return ORTE_ERR_UNRECOVERABLE;
|
|
||||||
}
|
}
|
||||||
break;
|
} else {
|
||||||
|
/* delete the route */
|
||||||
|
orte_routed.delete_route(proc);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
default:
|
case ORTE_PROC_STATE_HEARTBEAT_FAILED:
|
||||||
break;
|
/* heartbeats are only from daemons */
|
||||||
|
if (orte_enable_recovery) {
|
||||||
|
/* relocate its processes */
|
||||||
|
} else {
|
||||||
|
/* kill all local procs */
|
||||||
|
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
|
||||||
|
/* kill all jobs */
|
||||||
|
hnp_abort(ORTE_JOBID_WILDCARD, exit_code);
|
||||||
|
return ORTE_ERR_UNRECOVERABLE;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
|
@ -17,3 +17,17 @@ except due to an internal ORTE error.
|
|||||||
Job state: %s
|
Job state: %s
|
||||||
|
|
||||||
This information should probably be reported to the OMPI developers.
|
This information should probably be reported to the OMPI developers.
|
||||||
|
#
|
||||||
|
[errmgr-hnp:daemon-died]
|
||||||
|
The system has lost communication with the following daemon:
|
||||||
|
|
||||||
|
Daemon: %s
|
||||||
|
Node: %s
|
||||||
|
|
||||||
|
The reason for the lost communication channel is unknown. Possible
|
||||||
|
reasons include failure of the daemon itself, failure of the
|
||||||
|
connecting fabric/switch, and loss of the host node. Please
|
||||||
|
check with your system administrator to try to determine the
|
||||||
|
source of the problem.
|
||||||
|
|
||||||
|
Your job is being terminated as a result.
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user