Print a nice error message when a daemon fails, and exit with a non-zero status
This commit was SVN r23314.
Этот коммит содержится в:
родитель
1fad51776d
Коммит
3237b9ec87
@@ -124,6 +124,7 @@ static int update_state(orte_jobid_t job,
|
||||
orte_odls_child_t *child;
|
||||
int rc;
|
||||
orte_app_context_t *app;
|
||||
orte_proc_t *pdat;
|
||||
|
||||
/* indicate that this is the end of the line */
|
||||
*stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE;
|
||||
@@ -396,6 +397,17 @@ static int update_state(orte_jobid_t job,
|
||||
hnp_abort(ORTE_JOBID_WILDCARD, exit_code);
|
||||
}
|
||||
} else {
|
||||
if (NULL == (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died",
|
||||
ORTE_VPID_PRINT(proc->vpid), "Unknown");
|
||||
} else {
|
||||
orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died",
|
||||
ORTE_VPID_PRINT(proc->vpid),
|
||||
(NULL == pdat->node) ? "Unknown" :
|
||||
((NULL == pdat->node->name) ? "Unknown" : pdat->node->name));
|
||||
}
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE);
|
||||
update_proc(jdata, proc, state, pid, ORTE_ERR_COMM_FAILURE);
|
||||
/* kill all local procs */
|
||||
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
|
||||
|
@@ -17,3 +17,17 @@ except due to an internal ORTE error.
|
||||
Job state: %s
|
||||
|
||||
This information should probably be reported to the OMPI developers.
|
||||
#
|
||||
[errmgr-hnp:daemon-died]
|
||||
The system has lost communication with the following daemon:
|
||||
|
||||
Daemon: %s
|
||||
Node: %s
|
||||
|
||||
The reason for the lost communication channel is unknown. Possible
|
||||
reasons include failure of the daemon itself, failure of the
|
||||
connecting fabric/switch, and loss of the host node. Please
|
||||
check with your system administrator to try and determine the
|
||||
source of the problem.
|
||||
|
||||
Your job is being terminated as a result.
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user