Instead of "forced_terminate" just quietly causing the daemon to disappear, let's at least attempt to let the user know where the problem occurred.
Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
родитель
cde80bbf47
Коммит
2ab4f93f6a
@ -98,3 +98,10 @@ then it could be an internal programming error that should be
|
||||
reported to the developers. In the meantime, a workaround may
|
||||
be to set the MCA param routed=direct on the command line or
|
||||
in your environment.
|
||||
#
|
||||
[simple-message]
|
||||
An internal error has occurred in ORTE:
|
||||
|
||||
%s
|
||||
|
||||
This is something that should be reported to the developers.
|
||||
|
@ -64,6 +64,7 @@
|
||||
|
||||
static int init(void);
|
||||
static int finalize(void);
|
||||
static void hnp_abort(int error_code, char *fmt, ...);
|
||||
|
||||
static int predicted_fault(opal_list_t *proc_list,
|
||||
opal_list_t *node_list,
|
||||
@ -83,7 +84,7 @@ orte_errmgr_base_module_t orte_errmgr_default_hnp_module = {
|
||||
init,
|
||||
finalize,
|
||||
orte_errmgr_base_log,
|
||||
orte_errmgr_base_abort,
|
||||
hnp_abort,
|
||||
orte_errmgr_base_abort_peers,
|
||||
predicted_fault,
|
||||
suggest_map_targets,
|
||||
@ -125,6 +126,71 @@ static int finalize(void)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Timer callback armed by hnp_abort(). When it fires, the grace period
 * for getting the error message out has elapsed, so we simply shut down.
 */
static void wakeup(int sd, short args, void *cbdata)
{
    /* the event arguments carry nothing we need */
    (void)sd;
    (void)args;
    (void)cbdata;

    /* nothing more we can do */
    orte_quit(0, 0, NULL);
}
|
||||
|
||||
/* this function only gets called when FORCED_TERMINATE
|
||||
* has been invoked, which means that there is some
|
||||
* internal failure (e.g., to pack/unpack a correct value).
|
||||
* We could just exit, but that doesn't result in any
|
||||
* meaningful error message to the user. Likewise, just
|
||||
* printing something to stdout/stderr won't necessarily
|
||||
* get back to the user. Instead, we will send an error
|
||||
* report to mpirun and give it a chance to order our
|
||||
* termination. In order to ensure we _do_ terminate,
|
||||
* we set a timer - if it fires before we receive the
|
||||
* termination command, then we will exit on our own. This
|
||||
* protects us in the case that the failure is in the
|
||||
* messaging system itself */
|
||||
static void hnp_abort(int error_code, char *fmt, ...)
|
||||
{
|
||||
va_list arglist;
|
||||
char *outmsg = NULL;
|
||||
orte_timer_t *timer;
|
||||
|
||||
/* ensure we exit with non-zero status */
|
||||
ORTE_UPDATE_EXIT_STATUS(error_code);
|
||||
|
||||
/* If there was a message, construct it */
|
||||
va_start(arglist, fmt);
|
||||
if (NULL != fmt) {
|
||||
vasprintf(&outmsg, fmt, arglist);
|
||||
}
|
||||
va_end(arglist);
|
||||
|
||||
/* use the show-help system to get the message out */
|
||||
orte_show_help("help-errmgr-base.txt", "simple-message", true, outmsg);
|
||||
|
||||
/* this could have happened very early, so see if it happened
|
||||
* before we started anything - if so, we can just finalize */
|
||||
if (orte_never_launched) {
|
||||
orte_quit(0, 0, NULL);
|
||||
return;
|
||||
}
|
||||
|
||||
/* tell the daemons to terminate */
|
||||
if (ORTE_SUCCESS != orte_plm.terminate_orteds()) {
|
||||
orte_quit(0, 0, NULL);
|
||||
return;
|
||||
}
|
||||
|
||||
/* set a timer for exiting - this also gives the message a chance
|
||||
* to get out! */
|
||||
if (NULL == (timer = OBJ_NEW(orte_timer_t))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return;
|
||||
}
|
||||
timer->tv.tv_sec = 5;
|
||||
timer->tv.tv_usec = 0;
|
||||
opal_event_evtimer_set(orte_event_base, timer->ev, wakeup, NULL);
|
||||
opal_event_set_priority(timer->ev, ORTE_ERROR_PRI);
|
||||
opal_event_evtimer_add(timer->ev, &timer->tv);
|
||||
}
|
||||
|
||||
|
||||
static void job_errors(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
|
@ -59,7 +59,7 @@
|
||||
*/
|
||||
static int init(void);
|
||||
static int finalize(void);
|
||||
|
||||
static void orted_abort(int error_code, char *fmt, ...);
|
||||
static int predicted_fault(opal_list_t *proc_list,
|
||||
opal_list_t *node_list,
|
||||
opal_list_t *suggested_map);
|
||||
@ -78,7 +78,7 @@ orte_errmgr_base_module_t orte_errmgr_default_orted_module = {
|
||||
init,
|
||||
finalize,
|
||||
orte_errmgr_base_log,
|
||||
orte_errmgr_base_abort,
|
||||
orted_abort,
|
||||
orte_errmgr_base_abort_peers,
|
||||
predicted_fault,
|
||||
suggest_map_targets,
|
||||
@ -122,6 +122,119 @@ static int finalize(void)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Timer callback armed by orted_abort(). If it fires, no termination
 * order arrived within the grace period, so we shut down on our own.
 */
static void wakeup(int sd, short args, void *cbdata)
{
    /* the event arguments carry nothing we need */
    (void)sd;
    (void)args;
    (void)cbdata;

    /* nothing more we can do */
    orte_quit(0, 0, NULL);
}
|
||||
|
||||
/* this function only gets called when FORCED_TERMINATE
|
||||
* has been invoked, which means that there is some
|
||||
* internal failure (e.g., to pack/unpack a correct value).
|
||||
* We could just exit, but that doesn't result in any
|
||||
* meaningful error message to the user. Likewise, just
|
||||
* printing something to stdout/stderr won't necessarily
|
||||
* get back to the user. Instead, we will send an error
|
||||
* report to mpirun and give it a chance to order our
|
||||
* termination. In order to ensure we _do_ terminate,
|
||||
* we set a timer - if it fires before we receive the
|
||||
* termination command, then we will exit on our own. This
|
||||
* protects us in the case that the failure is in the
|
||||
* messaging system itself */
|
||||
static void orted_abort(int error_code, char *fmt, ...)
|
||||
{
|
||||
va_list arglist;
|
||||
char *outmsg = NULL;
|
||||
orte_plm_cmd_flag_t cmd;
|
||||
opal_buffer_t *alert;
|
||||
orte_vpid_t null=ORTE_VPID_INVALID;
|
||||
orte_proc_state_t state = ORTE_PROC_STATE_CALLED_ABORT;
|
||||
orte_timer_t *timer;
|
||||
int rc;
|
||||
|
||||
/* If there was a message, construct it */
|
||||
va_start(arglist, fmt);
|
||||
if (NULL != fmt) {
|
||||
vasprintf(&outmsg, fmt, arglist);
|
||||
}
|
||||
va_end(arglist);
|
||||
|
||||
/* use the show-help system to get the message out */
|
||||
orte_show_help("help-errmgr-base.txt", "simple-message", true, outmsg);
|
||||
|
||||
/* tell the HNP we are in distress */
|
||||
alert = OBJ_NEW(opal_buffer_t);
|
||||
/* pack update state command */
|
||||
cmd = ORTE_PLM_UPDATE_PROC_STATE;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(alert);
|
||||
goto cleanup;
|
||||
}
|
||||
/* pack the jobid */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &ORTE_PROC_MY_NAME->jobid, 1, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(alert);
|
||||
goto cleanup;
|
||||
}
|
||||
/* pack our vpid */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &ORTE_PROC_MY_NAME->vpid, 1, ORTE_VPID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(alert);
|
||||
goto cleanup;
|
||||
}
|
||||
/* pack our pid */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &orte_process_info.pid, 1, OPAL_PID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(alert);
|
||||
goto cleanup;
|
||||
}
|
||||
/* pack our state */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &state, 1, ORTE_PROC_STATE))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(alert);
|
||||
goto cleanup;
|
||||
}
|
||||
/* pack our exit code */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &error_code, 1, ORTE_EXIT_CODE))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(alert);
|
||||
goto cleanup;
|
||||
}
|
||||
/* flag that this job is complete so the receiver can know */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(alert);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* send it */
|
||||
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
|
||||
ORTE_PROC_MY_HNP, alert,
|
||||
ORTE_RML_TAG_PLM,
|
||||
orte_rml_send_callback, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(alert);
|
||||
/* we can't communicate, so give up */
|
||||
orte_quit(0, 0, NULL);
|
||||
return;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
/* set a timer for exiting - this also gives the message a chance
|
||||
* to get out! */
|
||||
if (NULL == (timer = OBJ_NEW(orte_timer_t))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return;
|
||||
}
|
||||
timer->tv.tv_sec = 5;
|
||||
timer->tv.tv_usec = 0;
|
||||
opal_event_evtimer_set(orte_event_base, timer->ev, wakeup, NULL);
|
||||
opal_event_set_priority(timer->ev, ORTE_ERROR_PRI);
|
||||
opal_event_evtimer_add(timer->ev, &timer->tv);
|
||||
|
||||
}
|
||||
|
||||
static void job_errors(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
@ -259,7 +372,7 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
/* terminate - our routed children will see
|
||||
* us leave and automatically die
|
||||
*/
|
||||
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
orte_quit(0, 0, NULL);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
|
@ -528,7 +528,8 @@ static void xcast_recv(int status, orte_process_name_t* sender,
|
||||
OBJ_RELEASE(item);
|
||||
continue;
|
||||
}
|
||||
if (ORTE_PROC_STATE_RUNNING < rec->state ||
|
||||
if ((ORTE_PROC_STATE_RUNNING < rec->state &&
|
||||
ORTE_PROC_STATE_CALLED_ABORT != rec->state) ||
|
||||
!ORTE_FLAG_TEST(rec, ORTE_PROC_FLAG_ALIVE)) {
|
||||
opal_output(0, "%s grpcomm:direct:send_relay proc %s not running - cannot relay",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&nm->name));
|
||||
|
@ -48,6 +48,7 @@
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/plm/plm_types.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
@ -64,42 +65,40 @@ ORTE_DECLSPEC extern mca_base_framework_t orte_state_base_framework;
|
||||
/* For ease in debugging the state machine, it is STRONGLY recommended
|
||||
* that the functions be accessed using the following macros
|
||||
*/
|
||||
#define ORTE_FORCED_TERMINATE(x) \
|
||||
do { \
|
||||
if (!orte_abnormal_term_ordered) { \
|
||||
opal_output_verbose(1, orte_state_base_framework.framework_output, \
|
||||
"%s FORCE-TERMINATE AT %s:%d", \
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
|
||||
__FILE__, __LINE__); \
|
||||
ORTE_UPDATE_EXIT_STATUS(x); \
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT); \
|
||||
} \
|
||||
/* Report an internal failure through the error manager's abort entry
 * point, unless an abnormal termination has already been ordered.
 * (x) is passed as the abort code and is also printed, by name and by
 * value, together with the file and line of the call site.
 *
 * The arguments must follow the order of the conversions in the format
 * string: process name, file, line, error name, error value.
 *
 * NOTE: (x) is expanded more than once, so avoid arguments with side
 * effects. The expansion ends in "while(0);" - the trailing semicolon
 * is kept for existing callers, but it means the macro cannot be the
 * unbraced body of an "if" that has an "else".
 */
#define ORTE_FORCED_TERMINATE(x)                                                    \
    do {                                                                            \
        if (!orte_abnormal_term_ordered) {                                          \
            orte_errmgr.abort((x), "%s FORCE-TERMINATE AT %s:%d - error %s(%d)",    \
                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),                   \
                              __FILE__, __LINE__,                                   \
                              ORTE_ERROR_NAME((x)), (x));                           \
        }                                                                           \
    } while(0);
|
||||
|
||||
#define ORTE_ACTIVATE_JOB_STATE(j, s) \
|
||||
do { \
|
||||
orte_job_t *shadow=(j); \
|
||||
opal_output_verbose(1, orte_state_base_framework.framework_output, \
|
||||
"%s ACTIVATE JOB %s STATE %s AT %s:%d", \
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
|
||||
(NULL == shadow) ? "NULL" : \
|
||||
ORTE_JOBID_PRINT(shadow->jobid), \
|
||||
orte_job_state_to_str((s)), \
|
||||
__FILE__, __LINE__); \
|
||||
orte_state.activate_job_state(shadow, (s)); \
|
||||
/* Activate state (s) for job (j) through orte_state.activate_job_state().
 * Before doing so, emit a verbosity-level-1 message on the state
 * framework's output stream naming this process, the job ("NULL" when
 * (j) is a NULL pointer), the state, and the file/line of the call site.
 *
 * (j) is evaluated once, into a local named "shadow"; (s) is expanded
 * twice. The expansion ends in "while(0);", so the macro cannot be the
 * unbraced body of an "if" that has an "else".
 */
#define ORTE_ACTIVATE_JOB_STATE(j, s) \
|
||||
do { \
|
||||
orte_job_t *shadow=(j); \
|
||||
opal_output_verbose(1, orte_state_base_framework.framework_output, \
|
||||
"%s ACTIVATE JOB %s STATE %s AT %s:%d", \
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
|
||||
(NULL == shadow) ? "NULL" : \
|
||||
ORTE_JOBID_PRINT(shadow->jobid), \
|
||||
orte_job_state_to_str((s)), \
|
||||
__FILE__, __LINE__); \
|
||||
orte_state.activate_job_state(shadow, (s)); \
|
||||
} while(0);
|
||||
|
||||
#define ORTE_ACTIVATE_PROC_STATE(p, s) \
|
||||
do { \
|
||||
orte_process_name_t *shadow=(p); \
|
||||
opal_output_verbose(1, orte_state_base_framework.framework_output, \
|
||||
"%s ACTIVATE PROC %s STATE %s AT %s:%d", \
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
|
||||
(NULL == shadow) ? "NULL" : \
|
||||
ORTE_NAME_PRINT(shadow), \
|
||||
orte_proc_state_to_str((s)), \
|
||||
__FILE__, __LINE__); \
|
||||
orte_state.activate_proc_state(shadow, (s)); \
|
||||
/* Activate state (s) for the process named by (p) through
 * orte_state.activate_proc_state(). Before doing so, emit a
 * verbosity-level-1 message on the state framework's output stream
 * naming this process, the target process ("NULL" when (p) is a NULL
 * pointer), the state, and the file/line of the call site.
 *
 * (p) is evaluated once, into a local named "shadow"; (s) is expanded
 * twice. The expansion ends in "while(0);", so the macro cannot be the
 * unbraced body of an "if" that has an "else".
 */
#define ORTE_ACTIVATE_PROC_STATE(p, s) \
|
||||
do { \
|
||||
orte_process_name_t *shadow=(p); \
|
||||
opal_output_verbose(1, orte_state_base_framework.framework_output, \
|
||||
"%s ACTIVATE PROC %s STATE %s AT %s:%d", \
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
|
||||
(NULL == shadow) ? "NULL" : \
|
||||
ORTE_NAME_PRINT(shadow), \
|
||||
orte_proc_state_to_str((s)), \
|
||||
__FILE__, __LINE__); \
|
||||
orte_state.activate_proc_state(shadow, (s)); \
|
||||
} while(0);
|
||||
|
||||
/**
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user