From 2ab4f93f6a799979820307869633832f83c63415 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Fri, 2 Jun 2017 08:28:16 -0700 Subject: [PATCH] Instead of "forced_terminate" just quietly causing the daemon to disappear, let's at least attempt to let the user know where the problem occurred. Signed-off-by: Ralph Castain --- orte/mca/errmgr/base/help-errmgr-base.txt | 7 ++ .../errmgr/default_hnp/errmgr_default_hnp.c | 68 +++++++++- .../default_orted/errmgr_default_orted.c | 119 +++++++++++++++++- orte/mca/grpcomm/direct/grpcomm_direct.c | 3 +- orte/mca/state/state.h | 63 +++++----- 5 files changed, 223 insertions(+), 37 deletions(-) diff --git a/orte/mca/errmgr/base/help-errmgr-base.txt b/orte/mca/errmgr/base/help-errmgr-base.txt index 1470bd13a9..07a9f71909 100644 --- a/orte/mca/errmgr/base/help-errmgr-base.txt +++ b/orte/mca/errmgr/base/help-errmgr-base.txt @@ -98,3 +98,10 @@ then it could be an internal programming error that should be reported to the developers. In the meantime, a workaround may be to set the MCA param routed=direct on the command line or in your environment. +# +[simple-message] +An internal error has occurred in ORTE: + +%s + +This is something that should be reported to the developers. 
diff --git a/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c b/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c
index 59c8e87a0f..3391306eab 100644
--- a/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c
+++ b/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c
@@ -64,6 +64,7 @@
 
 static int init(void);
 static int finalize(void);
+static void hnp_abort(int error_code, char *fmt, ...);
 
 static int predicted_fault(opal_list_t *proc_list,
                            opal_list_t *node_list,
@@ -83,7 +84,7 @@ orte_errmgr_base_module_t orte_errmgr_default_hnp_module = {
     init,
     finalize,
     orte_errmgr_base_log,
-    orte_errmgr_base_abort,
+    hnp_abort,
     orte_errmgr_base_abort_peers,
     predicted_fault,
     suggest_map_targets,
@@ -125,6 +126,71 @@ static int finalize(void)
     return ORTE_SUCCESS;
 }
 
+static void wakeup(int sd, short args, void *cbdata)
+{
+    /* nothing more we can do */
+    orte_quit(0, 0, NULL);
+}
+
+/* this function only gets called when FORCED_TERMINATE
+ * has been invoked, which means that there is some
+ * internal failure (e.g., to pack/unpack a correct value).
+ * We could just exit, but that doesn't result in any
+ * meaningful error message to the user. Instead, we
+ * print the error via show-help and order the daemons
+ * to terminate. In order to ensure we _do_ terminate,
+ * we set a timer - when it fires, we exit on our own.
+ * This protects us in the case that the failure is in
+ * the messaging system itself */
+static void hnp_abort(int error_code, char *fmt, ...)
+{
+    va_list arglist;
+    char *outmsg = NULL;
+    orte_timer_t *timer;
+
+    /* ensure we exit with non-zero status */
+    ORTE_UPDATE_EXIT_STATUS(error_code);
+
+    /* If there was a message, construct it */
+    va_start(arglist, fmt);
+    if (NULL != fmt && 0 > vasprintf(&outmsg, fmt, arglist)) {
+        outmsg = NULL;  /* contents are undefined on failure */
+    }
+    va_end(arglist);
+
+    /* use the show-help system to get the message out */
+    orte_show_help("help-errmgr-base.txt", "simple-message", true,
+                   (NULL == outmsg) ? "(no message)" : outmsg);
+    free(outmsg);
+
+    /* this could have happened very early, so see if it happened
+     * before we started anything - if so, we can just finalize */
+    if (orte_never_launched) {
+        orte_quit(0, 0, NULL);
+        return;
+    }
+
+    /* tell the daemons to terminate */
+    if (ORTE_SUCCESS != orte_plm.terminate_orteds()) {
+        orte_quit(0, 0, NULL);
+        return;
+    }
+
+    /* set a timer for exiting - this also gives the message a chance
+     * to get out! Without a timer nothing guarantees our exit, so quit */
+    if (NULL == (timer = OBJ_NEW(orte_timer_t))) {
+        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+        orte_quit(0, 0, NULL);
+        return;
+    }
+    timer->tv.tv_sec = 5;
+    timer->tv.tv_usec = 0;
+    opal_event_evtimer_set(orte_event_base, timer->ev, wakeup, NULL);
+    opal_event_set_priority(timer->ev, ORTE_ERROR_PRI);
+    opal_event_evtimer_add(timer->ev, &timer->tv);
+}
+
+
 static void job_errors(int fd, short args, void *cbdata)
 {
     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
diff --git a/orte/mca/errmgr/default_orted/errmgr_default_orted.c b/orte/mca/errmgr/default_orted/errmgr_default_orted.c
index a58733020e..ce90fdd598 100644
--- a/orte/mca/errmgr/default_orted/errmgr_default_orted.c
+++ b/orte/mca/errmgr/default_orted/errmgr_default_orted.c
@@ -59,7 +59,7 @@
  */
 static int init(void);
 static int finalize(void);
-
+static void orted_abort(int error_code, char *fmt, ...);
 static int predicted_fault(opal_list_t *proc_list,
                            opal_list_t *node_list,
                            opal_list_t *suggested_map);
@@ -78,7 +78,7 @@ orte_errmgr_base_module_t orte_errmgr_default_orted_module = {
     init,
     finalize,
     orte_errmgr_base_log,
-    orte_errmgr_base_abort,
+    orted_abort,
     orte_errmgr_base_abort_peers,
     predicted_fault,
     suggest_map_targets,
@@ -122,6 +122,119 @@ static int finalize(void)
     return ORTE_SUCCESS;
 }
 
+static void wakeup(int sd, short args, void *cbdata)
+{
+    /* nothing more we can do */
+    orte_quit(0, 0, NULL);
+}
+
+/* this function only gets called when FORCED_TERMINATE
+ * has been invoked, which means that there is some
+ * internal failure (e.g., to pack/unpack a correct value).
+ * We could just exit, but that doesn't result in any
+ * meaningful error message to the user. Likewise, just
+ * printing something to stdout/stderr won't necessarily
+ * get back to the user. Instead, we will send an error
+ * report to mpirun and give it a chance to order our
+ * termination. In order to ensure we _do_ terminate,
+ * we set a timer - if it fires before we receive the
+ * termination command, then we will exit on our own. This
+ * protects us in the case that the failure is in the
+ * messaging system itself */
+static void orted_abort(int error_code, char *fmt, ...)
+{
+    va_list arglist;
+    char *outmsg = NULL;
+    orte_plm_cmd_flag_t cmd = ORTE_PLM_UPDATE_PROC_STATE;
+    opal_buffer_t *alert;
+    orte_vpid_t null=ORTE_VPID_INVALID;
+    orte_proc_state_t state = ORTE_PROC_STATE_CALLED_ABORT;
+    orte_timer_t *timer;
+    int rc;
+
+    /* If there was a message, construct it */
+    va_start(arglist, fmt);
+    if (NULL != fmt && 0 > vasprintf(&outmsg, fmt, arglist)) {
+        outmsg = NULL;  /* contents are undefined on failure */
+    }
+    va_end(arglist);
+
+    /* use the show-help system to get the message out */
+    orte_show_help("help-errmgr-base.txt", "simple-message", true,
+                   (NULL == outmsg) ? "(no message)" : outmsg);
+    free(outmsg);
+
+    /* tell the HNP we are in distress */
+    alert = OBJ_NEW(opal_buffer_t);
+    /* pack update state command */
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
+        ORTE_ERROR_LOG(rc);
+        OBJ_RELEASE(alert);
+        goto cleanup;
+    }
+    /* pack the jobid */
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &ORTE_PROC_MY_NAME->jobid, 1, ORTE_JOBID))) {
+        ORTE_ERROR_LOG(rc);
+        OBJ_RELEASE(alert);
+        goto cleanup;
+    }
+    /* pack our vpid */
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &ORTE_PROC_MY_NAME->vpid, 1, ORTE_VPID))) {
+        ORTE_ERROR_LOG(rc);
+        OBJ_RELEASE(alert);
+        goto cleanup;
+    }
+    /* pack our pid */
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &orte_process_info.pid, 1, OPAL_PID))) {
+        ORTE_ERROR_LOG(rc);
+        OBJ_RELEASE(alert);
+        goto cleanup;
+    }
+    /* pack our state */
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &state, 1, ORTE_PROC_STATE))) {
+        ORTE_ERROR_LOG(rc);
+        OBJ_RELEASE(alert);
+        goto cleanup;
+    }
+    /* pack our exit code */
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &error_code, 1, ORTE_EXIT_CODE))) {
+        ORTE_ERROR_LOG(rc);
+        OBJ_RELEASE(alert);
+        goto cleanup;
+    }
+    /* flag that this job is complete so the receiver can know */
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
+        ORTE_ERROR_LOG(rc);
+        OBJ_RELEASE(alert);
+        goto cleanup;
+    }
+
+    /* send it */
+    if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit, ORTE_PROC_MY_HNP,
+                                          alert, ORTE_RML_TAG_PLM,
+                                          orte_rml_send_callback, NULL))) {
+        ORTE_ERROR_LOG(rc);
+        OBJ_RELEASE(alert);
+        /* we can't communicate, so give up */
+        orte_quit(0, 0, NULL);
+        return;
+    }
+
+ cleanup:
+    /* set a timer for exiting - this also gives the message a chance
+     * to get out! Without a timer nothing guarantees our exit, so quit */
+    if (NULL == (timer = OBJ_NEW(orte_timer_t))) {
+        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+        orte_quit(0, 0, NULL);
+        return;
+    }
+    timer->tv.tv_sec = 5;
+    timer->tv.tv_usec = 0;
+    opal_event_evtimer_set(orte_event_base, timer->ev, wakeup, NULL);
+    opal_event_set_priority(timer->ev, ORTE_ERROR_PRI);
+    opal_event_evtimer_add(timer->ev, &timer->tv);
+}
+
 static void job_errors(int fd, short args, void *cbdata)
 {
     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
@@ -259,7 +372,7 @@ static void proc_errors(int fd, short args, void *cbdata)
             /* terminate - our routed children will see
              * us leave and automatically die
              */
-            ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
+            orte_quit(0, 0, NULL);
             goto cleanup;
         }
 
diff --git a/orte/mca/grpcomm/direct/grpcomm_direct.c b/orte/mca/grpcomm/direct/grpcomm_direct.c
index 0621d5db12..818c81ce61 100644
--- a/orte/mca/grpcomm/direct/grpcomm_direct.c
+++ b/orte/mca/grpcomm/direct/grpcomm_direct.c
@@ -528,7 +528,8 @@ static void xcast_recv(int status, orte_process_name_t* sender,
                     OBJ_RELEASE(item);
                     continue;
                 }
-                if (ORTE_PROC_STATE_RUNNING < rec->state ||
+                if ((ORTE_PROC_STATE_RUNNING < rec->state &&
+                     ORTE_PROC_STATE_CALLED_ABORT != rec->state) ||
                     !ORTE_FLAG_TEST(rec, ORTE_PROC_FLAG_ALIVE)) {
                     opal_output(0, "%s grpcomm:direct:send_relay proc %s not running - cannot relay",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&nm->name));
diff --git a/orte/mca/state/state.h b/orte/mca/state/state.h
index f1f4ece061..ee3ec8378c 100644
--- a/orte/mca/state/state.h
+++ b/orte/mca/state/state.h
@@ -48,6 +48,7 @@
 #include "opal/class/opal_list.h"
 #include "opal/mca/event/event.h"
 
+#include "orte/mca/errmgr/errmgr.h"
 #include "orte/mca/plm/plm_types.h"
 #include "orte/runtime/orte_globals.h"
 
@@ -64,42 +65,40 @@ ORTE_DECLSPEC extern mca_base_framework_t orte_state_base_framework;
 /* For ease in debugging the state machine, it is STRONGLY recommended
  * that the functions be accessed using the following macros
  */
-#define ORTE_FORCED_TERMINATE(x)                                                \
-    do {                                                                        \
-        if (!orte_abnormal_term_ordered) {                                      \
-            opal_output_verbose(1, orte_state_base_framework.framework_output,  \
-                                "%s FORCE-TERMINATE AT %s:%d",                  \
-                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),             \
-                                __FILE__, __LINE__);                            \
-            ORTE_UPDATE_EXIT_STATUS(x);                                         \
-            ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT);          \
-        }                                                                       \
+#define ORTE_FORCED_TERMINATE(x)                                                    \
+    do {                                                                            \
+        if (!orte_abnormal_term_ordered) {                                          \
+            orte_errmgr.abort((x), "%s FORCE-TERMINATE AT %s:%d - error %s(%d)",    \
+                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),                   \
+                              __FILE__, __LINE__,                                   \
+                              ORTE_ERROR_NAME((x)), (x));                           \
+        }                                                                           \
     } while(0);
 
-#define ORTE_ACTIVATE_JOB_STATE(j, s)                                           \
-    do {                                                                        \
-        orte_job_t *shadow=(j);                                                 \
-        opal_output_verbose(1, orte_state_base_framework.framework_output,      \
-                            "%s ACTIVATE JOB %s STATE %s AT %s:%d",            \
-                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),                 \
-                            (NULL == shadow) ? "NULL" :                         \
-                            ORTE_JOBID_PRINT(shadow->jobid),                    \
-                            orte_job_state_to_str((s)),                         \
-                            __FILE__, __LINE__);                                \
-        orte_state.activate_job_state(shadow, (s));                             \
+#define ORTE_ACTIVATE_JOB_STATE(j, s)                                               \
+    do {                                                                            \
+        orte_job_t *shadow=(j);                                                     \
+        opal_output_verbose(1, orte_state_base_framework.framework_output,          \
+                            "%s ACTIVATE JOB %s STATE %s AT %s:%d",                \
+                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),                     \
+                            (NULL == shadow) ? "NULL" :                             \
+                            ORTE_JOBID_PRINT(shadow->jobid),                        \
+                            orte_job_state_to_str((s)),                             \
+                            __FILE__, __LINE__);                                    \
+        orte_state.activate_job_state(shadow, (s));                                 \
     } while(0);
 
-#define ORTE_ACTIVATE_PROC_STATE(p, s)                                          \
-    do {                                                                        \
-        orte_process_name_t *shadow=(p);                                        \
-        opal_output_verbose(1, orte_state_base_framework.framework_output,      \
-                            "%s ACTIVATE PROC %s STATE %s AT %s:%d",           \
-                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),                 \
-                            (NULL == shadow) ? "NULL" :                         \
-                            ORTE_NAME_PRINT(shadow),                            \
-                            orte_proc_state_to_str((s)),                        \
-                            __FILE__, __LINE__);                                \
-        orte_state.activate_proc_state(shadow, (s));                            \
+#define ORTE_ACTIVATE_PROC_STATE(p, s)                                              \
+    do {                                                                            \
+        orte_process_name_t *shadow=(p);                                            \
+        opal_output_verbose(1, orte_state_base_framework.framework_output,          \
+                            "%s ACTIVATE PROC %s STATE %s AT %s:%d",               \
+                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),                     \
+                            (NULL == shadow) ? "NULL" :                             \
+                            ORTE_NAME_PRINT(shadow),                                \
+                            orte_proc_state_to_str((s)),                            \
+                            __FILE__, __LINE__);                                    \
+        orte_state.activate_proc_state(shadow, (s));                                \
     } while(0);
 
 /**