1
1

Instead of "forced_terminate" just quietly causing the daemon to disappear, let's at least attempt to let the user know where the problem occurred.

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Ralph Castain 2017-06-02 08:28:16 -07:00
родитель cde80bbf47
Коммит 2ab4f93f6a
5 изменённых файлов: 223 добавлений и 37 удалений

Просмотреть файл

@ -98,3 +98,10 @@ then it could be an internal programming error that should be
reported to the developers. In the meantime, a workaround may
be to set the MCA param routed=direct on the command line or
in your environment.
#
[simple-message]
An internal error has occurred in ORTE:
%s
This is something that should be reported to the developers.

Просмотреть файл

@ -64,6 +64,7 @@
static int init(void);
static int finalize(void);
static void hnp_abort(int error_code, char *fmt, ...);
static int predicted_fault(opal_list_t *proc_list,
opal_list_t *node_list,
@ -83,7 +84,7 @@ orte_errmgr_base_module_t orte_errmgr_default_hnp_module = {
init,
finalize,
orte_errmgr_base_log,
orte_errmgr_base_abort,
hnp_abort,
orte_errmgr_base_abort_peers,
predicted_fault,
suggest_map_targets,
@ -125,6 +126,71 @@ static int finalize(void)
return ORTE_SUCCESS;
}
/* Timer callback registered by hnp_abort(): once the grace period has
 * elapsed there is nothing left to wait for, so shut down.  None of the
 * event arguments are used. */
static void wakeup(int sd, short args, void *cbdata)
{
    (void)sd;
    (void)args;
    (void)cbdata;

    /* nothing more we can do */
    orte_quit(0, 0, NULL);
}
/* This function only gets called when FORCED_TERMINATE has been
 * invoked, which means that there is some internal failure (e.g.,
 * to pack/unpack a correct value).  We could just exit, but that
 * doesn't result in any meaningful error message to the user.
 * Instead, we:
 *   1. record the exit status,
 *   2. report the problem through the show-help system,
 *   3. order the daemons to terminate, and
 *   4. set a timer - when it fires we exit on our own.  This gives
 *      the message a chance to get out and protects us in the case
 *      that the failure is in the messaging system itself.
 *
 * If nothing was ever launched, if the daemons cannot be told to
 * terminate, or if the timer cannot be created, we exit immediately.
 *
 * error_code - status passed to ORTE_UPDATE_EXIT_STATUS
 * fmt        - printf-style format for the message (may be NULL),
 *              followed by its arguments
 */
static void hnp_abort(int error_code, char *fmt, ...)
{
    va_list arglist;
    char *outmsg = NULL;
    orte_timer_t *timer;

    /* ensure we exit with non-zero status */
    ORTE_UPDATE_EXIT_STATUS(error_code);

    /* If there was a message, construct it */
    if (NULL != fmt) {
        va_start(arglist, fmt);
        if (0 > vasprintf(&outmsg, fmt, arglist)) {
            /* the pointer is undefined after a failure - do not use it */
            outmsg = NULL;
        }
        va_end(arglist);
    }

    /* use the show-help system to get the message out - never hand a
     * NULL pointer to the "%s" in the help topic */
    orte_show_help("help-errmgr-base.txt", "simple-message", true,
                   (NULL != outmsg) ? outmsg : "(no error message available)");
    /* NOTE(review): assumes orte_show_help renders the text before it
     * returns and does not keep our pointer - confirm */
    free(outmsg);

    /* this could have happened very early, so see if it happened
     * before we started anything - if so, we can just finalize */
    if (orte_never_launched) {
        orte_quit(0, 0, NULL);
        return;
    }

    /* tell the daemons to terminate */
    if (ORTE_SUCCESS != orte_plm.terminate_orteds()) {
        orte_quit(0, 0, NULL);
        return;
    }

    /* set a timer for exiting - this also gives the message a chance
     * to get out! */
    if (NULL == (timer = OBJ_NEW(orte_timer_t))) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        /* without the timer nothing guarantees that we ever exit,
         * so leave right now */
        orte_quit(0, 0, NULL);
        return;
    }
    timer->tv.tv_sec = 5;
    timer->tv.tv_usec = 0;
    opal_event_evtimer_set(orte_event_base, timer->ev, wakeup, NULL);
    opal_event_set_priority(timer->ev, ORTE_ERROR_PRI);
    opal_event_evtimer_add(timer->ev, &timer->tv);
}
static void job_errors(int fd, short args, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;

Просмотреть файл

@ -59,7 +59,7 @@
*/
static int init(void);
static int finalize(void);
static void orted_abort(int error_code, char *fmt, ...);
static int predicted_fault(opal_list_t *proc_list,
opal_list_t *node_list,
opal_list_t *suggested_map);
@ -78,7 +78,7 @@ orte_errmgr_base_module_t orte_errmgr_default_orted_module = {
init,
finalize,
orte_errmgr_base_log,
orte_errmgr_base_abort,
orted_abort,
orte_errmgr_base_abort_peers,
predicted_fault,
suggest_map_targets,
@ -122,6 +122,119 @@ static int finalize(void)
return ORTE_SUCCESS;
}
/* Timer callback registered by orted_abort(): the wait for a
 * termination order has run out, so shut down on our own.  None of the
 * event arguments are used. */
static void wakeup(int sd, short args, void *cbdata)
{
    (void)sd;
    (void)args;
    (void)cbdata;

    /* nothing more we can do */
    orte_quit(0, 0, NULL);
}
/* This function only gets called when FORCED_TERMINATE has been
 * invoked, which means that there is some internal failure (e.g.,
 * to pack/unpack a correct value).  We could just exit, but that
 * doesn't result in any meaningful error message to the user.
 * Likewise, just printing something to stdout/stderr won't
 * necessarily get back to the user.  Instead, we report the problem
 * through the show-help system and send a state update
 * (ORTE_PROC_STATE_CALLED_ABORT plus our exit code) to the HNP so it
 * has a chance to order our termination.  In order to ensure we _do_
 * terminate, we set a timer - if it fires before we receive the
 * termination command, then we will exit on our own.  This protects
 * us in the case that the failure is in the messaging system itself.
 *
 * If the report cannot be built we still arm the timer; if it cannot
 * be sent, or the timer cannot be created, we exit immediately.
 *
 * error_code - exit code packed into the report to the HNP
 * fmt        - printf-style format for the message (may be NULL),
 *              followed by its arguments
 */
static void orted_abort(int error_code, char *fmt, ...)
{
    va_list arglist;
    char *outmsg = NULL;
    orte_plm_cmd_flag_t cmd;
    opal_buffer_t *alert;
    orte_vpid_t null=ORTE_VPID_INVALID;
    orte_proc_state_t state = ORTE_PROC_STATE_CALLED_ABORT;
    orte_timer_t *timer;
    int rc;

    /* If there was a message, construct it */
    if (NULL != fmt) {
        va_start(arglist, fmt);
        if (0 > vasprintf(&outmsg, fmt, arglist)) {
            /* the pointer is undefined after a failure - do not use it */
            outmsg = NULL;
        }
        va_end(arglist);
    }

    /* use the show-help system to get the message out - never hand a
     * NULL pointer to the "%s" in the help topic */
    orte_show_help("help-errmgr-base.txt", "simple-message", true,
                   (NULL != outmsg) ? outmsg : "(no error message available)");
    /* NOTE(review): assumes orte_show_help renders the text before it
     * returns and does not keep our pointer - confirm */
    free(outmsg);

    /* tell the HNP we are in distress */
    if (NULL == (alert = OBJ_NEW(opal_buffer_t))) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        /* cannot build the report - fall back to the exit timer */
        goto cleanup;
    }

    /* pack update state command */
    cmd = ORTE_PLM_UPDATE_PROC_STATE;
    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(alert);
        goto cleanup;
    }
    /* pack the jobid */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &ORTE_PROC_MY_NAME->jobid, 1, ORTE_JOBID))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(alert);
        goto cleanup;
    }
    /* pack our vpid */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &ORTE_PROC_MY_NAME->vpid, 1, ORTE_VPID))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(alert);
        goto cleanup;
    }
    /* pack our pid */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &orte_process_info.pid, 1, OPAL_PID))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(alert);
        goto cleanup;
    }
    /* pack our state */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &state, 1, ORTE_PROC_STATE))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(alert);
        goto cleanup;
    }
    /* pack our exit code */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &error_code, 1, ORTE_EXIT_CODE))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(alert);
        goto cleanup;
    }
    /* flag that this job is complete so the receiver can know */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(alert);
        goto cleanup;
    }

    /* send it */
    if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
                                          ORTE_PROC_MY_HNP, alert,
                                          ORTE_RML_TAG_PLM,
                                          orte_rml_send_callback, NULL))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(alert);
        /* we can't communicate, so give up */
        orte_quit(0, 0, NULL);
        return;
    }

  cleanup:
    /* set a timer for exiting - this also gives the message a chance
     * to get out! */
    if (NULL == (timer = OBJ_NEW(orte_timer_t))) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        /* without the timer nothing guarantees that we ever exit,
         * so leave right now */
        orte_quit(0, 0, NULL);
        return;
    }
    timer->tv.tv_sec = 5;
    timer->tv.tv_usec = 0;
    opal_event_evtimer_set(orte_event_base, timer->ev, wakeup, NULL);
    opal_event_set_priority(timer->ev, ORTE_ERROR_PRI);
    opal_event_evtimer_add(timer->ev, &timer->tv);
}
static void job_errors(int fd, short args, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
@ -259,7 +372,7 @@ static void proc_errors(int fd, short args, void *cbdata)
/* terminate - our routed children will see
* us leave and automatically die
*/
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
orte_quit(0, 0, NULL);
goto cleanup;
}

Просмотреть файл

@ -528,7 +528,8 @@ static void xcast_recv(int status, orte_process_name_t* sender,
OBJ_RELEASE(item);
continue;
}
if (ORTE_PROC_STATE_RUNNING < rec->state ||
if ((ORTE_PROC_STATE_RUNNING < rec->state &&
ORTE_PROC_STATE_CALLED_ABORT != rec->state) ||
!ORTE_FLAG_TEST(rec, ORTE_PROC_FLAG_ALIVE)) {
opal_output(0, "%s grpcomm:direct:send_relay proc %s not running - cannot relay",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&nm->name));

Просмотреть файл

@ -48,6 +48,7 @@
#include "opal/class/opal_list.h"
#include "opal/mca/event/event.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/plm/plm_types.h"
#include "orte/runtime/orte_globals.h"
@ -64,42 +65,40 @@ ORTE_DECLSPEC extern mca_base_framework_t orte_state_base_framework;
/* For ease in debugging the state machine, it is STRONGLY recommended
* that the functions be accessed using the following macros
*/
#define ORTE_FORCED_TERMINATE(x) \
do { \
if (!orte_abnormal_term_ordered) { \
opal_output_verbose(1, orte_state_base_framework.framework_output, \
"%s FORCE-TERMINATE AT %s:%d", \
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
__FILE__, __LINE__); \
ORTE_UPDATE_EXIT_STATUS(x); \
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT); \
} \
/* Force termination because of an internal error.
 *
 * Unless orte_abnormal_term_ordered is already set, hand the error code
 * to orte_errmgr.abort together with a message naming this process,
 * the source location of the invocation and the error (name and code).
 * The arguments are listed in the order the format string consumes
 * them: process name, file, line, error name, error code.
 *
 * Note that (x) is evaluated more than once, so it must be free of
 * side effects.  The trailing semicolon after while(0) is kept as in
 * the original definition in case a call site relies on it.
 */
#define ORTE_FORCED_TERMINATE(x)                                                 \
    do {                                                                         \
        if (!orte_abnormal_term_ordered) {                                       \
            orte_errmgr.abort((x), "%s FORCE-TERMINATE AT %s:%d - error %s(%d)", \
                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),                \
                              __FILE__, __LINE__,                                \
                              ORTE_ERROR_NAME((x)), (x));                        \
        }                                                                        \
    } while(0);
#define ORTE_ACTIVATE_JOB_STATE(j, s) \
do { \
orte_job_t *shadow=(j); \
opal_output_verbose(1, orte_state_base_framework.framework_output, \
"%s ACTIVATE JOB %s STATE %s AT %s:%d", \
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
(NULL == shadow) ? "NULL" : \
ORTE_JOBID_PRINT(shadow->jobid), \
orte_job_state_to_str((s)), \
__FILE__, __LINE__); \
orte_state.activate_job_state(shadow, (s)); \
/* Activate state (s) for job (j), which may be NULL.
 *
 * First emits a verbosity-1 message on the state framework's output
 * stream giving this process' name, the job's jobid (or "NULL"), the
 * state as a string and the file/line of the invocation; then calls
 * orte_state.activate_job_state with the job and the state.
 *
 * (j) is evaluated once into the local "shadow"; (s) is evaluated
 * twice.  The expansion already ends in a semicolon.
 */
#define ORTE_ACTIVATE_JOB_STATE(j, s) \
do { \
orte_job_t *shadow=(j); \
opal_output_verbose(1, orte_state_base_framework.framework_output, \
"%s ACTIVATE JOB %s STATE %s AT %s:%d", \
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
(NULL == shadow) ? "NULL" : \
ORTE_JOBID_PRINT(shadow->jobid), \
orte_job_state_to_str((s)), \
__FILE__, __LINE__); \
orte_state.activate_job_state(shadow, (s)); \
} while(0);
#define ORTE_ACTIVATE_PROC_STATE(p, s) \
do { \
orte_process_name_t *shadow=(p); \
opal_output_verbose(1, orte_state_base_framework.framework_output, \
"%s ACTIVATE PROC %s STATE %s AT %s:%d", \
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
(NULL == shadow) ? "NULL" : \
ORTE_NAME_PRINT(shadow), \
orte_proc_state_to_str((s)), \
__FILE__, __LINE__); \
orte_state.activate_proc_state(shadow, (s)); \
/* Activate state (s) for the process named by (p), which may be NULL.
 *
 * First emits a verbosity-1 message on the state framework's output
 * stream giving this process' name, the target process' name (or
 * "NULL"), the state as a string and the file/line of the invocation;
 * then calls orte_state.activate_proc_state with the name and state.
 *
 * (p) is evaluated once into the local "shadow"; (s) is evaluated
 * twice.  The expansion already ends in a semicolon.
 */
#define ORTE_ACTIVATE_PROC_STATE(p, s) \
do { \
orte_process_name_t *shadow=(p); \
opal_output_verbose(1, orte_state_base_framework.framework_output, \
"%s ACTIVATE PROC %s STATE %s AT %s:%d", \
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
(NULL == shadow) ? "NULL" : \
ORTE_NAME_PRINT(shadow), \
orte_proc_state_to_str((s)), \
__FILE__, __LINE__); \
orte_state.activate_proc_state(shadow, (s)); \
} while(0);
/**