Merge pull request #3647 from rhc54/topic/forced
Provide better help when forced_terminate is invoked
Этот коммит содержится в:
Коммит
e45a358bf0
@ -98,3 +98,10 @@ then it could be an internal programming error that should be
|
|||||||
reported to the developers. In the meantime, a workaround may
|
reported to the developers. In the meantime, a workaround may
|
||||||
be to set the MCA param routed=direct on the command line or
|
be to set the MCA param routed=direct on the command line or
|
||||||
in your environment.
|
in your environment.
|
||||||
|
#
|
||||||
|
[simple-message]
|
||||||
|
An internal error has occurred in ORTE:
|
||||||
|
|
||||||
|
%s
|
||||||
|
|
||||||
|
This is something that should be reported to the developers.
|
||||||
|
@ -64,6 +64,7 @@
|
|||||||
|
|
||||||
static int init(void);
|
static int init(void);
|
||||||
static int finalize(void);
|
static int finalize(void);
|
||||||
|
static void hnp_abort(int error_code, char *fmt, ...);
|
||||||
|
|
||||||
static int predicted_fault(opal_list_t *proc_list,
|
static int predicted_fault(opal_list_t *proc_list,
|
||||||
opal_list_t *node_list,
|
opal_list_t *node_list,
|
||||||
@ -83,7 +84,7 @@ orte_errmgr_base_module_t orte_errmgr_default_hnp_module = {
|
|||||||
init,
|
init,
|
||||||
finalize,
|
finalize,
|
||||||
orte_errmgr_base_log,
|
orte_errmgr_base_log,
|
||||||
orte_errmgr_base_abort,
|
hnp_abort,
|
||||||
orte_errmgr_base_abort_peers,
|
orte_errmgr_base_abort_peers,
|
||||||
predicted_fault,
|
predicted_fault,
|
||||||
suggest_map_targets,
|
suggest_map_targets,
|
||||||
@ -125,6 +126,71 @@ static int finalize(void)
|
|||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Timer callback armed by hnp_abort. Once the grace period has
 * expired there is nothing left to wait for, so shut down. The
 * event arguments are not used. */
static void wakeup(int sd, short args, void *cbdata)
{
    (void)sd;
    (void)args;
    (void)cbdata;

    /* nothing more we can do */
    orte_quit(0, 0, NULL);
}
|
||||||
|
|
||||||
|
/* this function only gets called when FORCED_TERMINATE
|
||||||
|
* has been invoked, which means that there is some
|
||||||
|
* internal failure (e.g., to pack/unpack a correct value).
|
||||||
|
* We could just exit, but that doesn't result in any
|
||||||
|
* meaningful error message to the user. Likewise, just
|
||||||
|
* printing something to stdout/stderr won't necessarily
|
||||||
|
* get back to the user. Instead, we will send an error
|
||||||
|
* report to mpirun and give it a chance to order our
|
||||||
|
* termination. In order to ensure we _do_ terminate,
|
||||||
|
* we set a timer - if it fires before we receive the
|
||||||
|
* termination command, then we will exit on our own. This
|
||||||
|
* protects us in the case that the failure is in the
|
||||||
|
* messaging system itself */
|
||||||
|
static void hnp_abort(int error_code, char *fmt, ...)
|
||||||
|
{
|
||||||
|
va_list arglist;
|
||||||
|
char *outmsg = NULL;
|
||||||
|
orte_timer_t *timer;
|
||||||
|
|
||||||
|
/* ensure we exit with non-zero status */
|
||||||
|
ORTE_UPDATE_EXIT_STATUS(error_code);
|
||||||
|
|
||||||
|
/* If there was a message, construct it */
|
||||||
|
va_start(arglist, fmt);
|
||||||
|
if (NULL != fmt) {
|
||||||
|
vasprintf(&outmsg, fmt, arglist);
|
||||||
|
}
|
||||||
|
va_end(arglist);
|
||||||
|
|
||||||
|
/* use the show-help system to get the message out */
|
||||||
|
orte_show_help("help-errmgr-base.txt", "simple-message", true, outmsg);
|
||||||
|
|
||||||
|
/* this could have happened very early, so see if it happened
|
||||||
|
* before we started anything - if so, we can just finalize */
|
||||||
|
if (orte_never_launched) {
|
||||||
|
orte_quit(0, 0, NULL);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* tell the daemons to terminate */
|
||||||
|
if (ORTE_SUCCESS != orte_plm.terminate_orteds()) {
|
||||||
|
orte_quit(0, 0, NULL);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* set a timer for exiting - this also gives the message a chance
|
||||||
|
* to get out! */
|
||||||
|
if (NULL == (timer = OBJ_NEW(orte_timer_t))) {
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
timer->tv.tv_sec = 5;
|
||||||
|
timer->tv.tv_usec = 0;
|
||||||
|
opal_event_evtimer_set(orte_event_base, timer->ev, wakeup, NULL);
|
||||||
|
opal_event_set_priority(timer->ev, ORTE_ERROR_PRI);
|
||||||
|
opal_event_evtimer_add(timer->ev, &timer->tv);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
static void job_errors(int fd, short args, void *cbdata)
|
static void job_errors(int fd, short args, void *cbdata)
|
||||||
{
|
{
|
||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||||
|
@ -59,7 +59,7 @@
|
|||||||
*/
|
*/
|
||||||
static int init(void);
|
static int init(void);
|
||||||
static int finalize(void);
|
static int finalize(void);
|
||||||
|
static void orted_abort(int error_code, char *fmt, ...);
|
||||||
static int predicted_fault(opal_list_t *proc_list,
|
static int predicted_fault(opal_list_t *proc_list,
|
||||||
opal_list_t *node_list,
|
opal_list_t *node_list,
|
||||||
opal_list_t *suggested_map);
|
opal_list_t *suggested_map);
|
||||||
@ -78,7 +78,7 @@ orte_errmgr_base_module_t orte_errmgr_default_orted_module = {
|
|||||||
init,
|
init,
|
||||||
finalize,
|
finalize,
|
||||||
orte_errmgr_base_log,
|
orte_errmgr_base_log,
|
||||||
orte_errmgr_base_abort,
|
orted_abort,
|
||||||
orte_errmgr_base_abort_peers,
|
orte_errmgr_base_abort_peers,
|
||||||
predicted_fault,
|
predicted_fault,
|
||||||
suggest_map_targets,
|
suggest_map_targets,
|
||||||
@ -122,6 +122,119 @@ static int finalize(void)
|
|||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Timer callback armed by orted_abort. Once the grace period has
 * expired there is nothing left to wait for, so shut down. The
 * event arguments are not used. */
static void wakeup(int sd, short args, void *cbdata)
{
    (void)sd;
    (void)args;
    (void)cbdata;

    /* nothing more we can do */
    orte_quit(0, 0, NULL);
}
|
||||||
|
|
||||||
|
/* this function only gets called when FORCED_TERMINATE
|
||||||
|
* has been invoked, which means that there is some
|
||||||
|
* internal failure (e.g., to pack/unpack a correct value).
|
||||||
|
* We could just exit, but that doesn't result in any
|
||||||
|
* meaningful error message to the user. Likewise, just
|
||||||
|
* printing something to stdout/stderr won't necessarily
|
||||||
|
* get back to the user. Instead, we will send an error
|
||||||
|
* report to mpirun and give it a chance to order our
|
||||||
|
* termination. In order to ensure we _do_ terminate,
|
||||||
|
* we set a timer - if it fires before we receive the
|
||||||
|
* termination command, then we will exit on our own. This
|
||||||
|
* protects us in the case that the failure is in the
|
||||||
|
* messaging system itself */
|
||||||
|
static void orted_abort(int error_code, char *fmt, ...)
|
||||||
|
{
|
||||||
|
va_list arglist;
|
||||||
|
char *outmsg = NULL;
|
||||||
|
orte_plm_cmd_flag_t cmd;
|
||||||
|
opal_buffer_t *alert;
|
||||||
|
orte_vpid_t null=ORTE_VPID_INVALID;
|
||||||
|
orte_proc_state_t state = ORTE_PROC_STATE_CALLED_ABORT;
|
||||||
|
orte_timer_t *timer;
|
||||||
|
int rc;
|
||||||
|
|
||||||
|
/* If there was a message, construct it */
|
||||||
|
va_start(arglist, fmt);
|
||||||
|
if (NULL != fmt) {
|
||||||
|
vasprintf(&outmsg, fmt, arglist);
|
||||||
|
}
|
||||||
|
va_end(arglist);
|
||||||
|
|
||||||
|
/* use the show-help system to get the message out */
|
||||||
|
orte_show_help("help-errmgr-base.txt", "simple-message", true, outmsg);
|
||||||
|
|
||||||
|
/* tell the HNP we are in distress */
|
||||||
|
alert = OBJ_NEW(opal_buffer_t);
|
||||||
|
/* pack update state command */
|
||||||
|
cmd = ORTE_PLM_UPDATE_PROC_STATE;
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
OBJ_RELEASE(alert);
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
/* pack the jobid */
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &ORTE_PROC_MY_NAME->jobid, 1, ORTE_JOBID))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
OBJ_RELEASE(alert);
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
/* pack our vpid */
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &ORTE_PROC_MY_NAME->vpid, 1, ORTE_VPID))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
OBJ_RELEASE(alert);
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
/* pack our pid */
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &orte_process_info.pid, 1, OPAL_PID))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
OBJ_RELEASE(alert);
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
/* pack our state */
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &state, 1, ORTE_PROC_STATE))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
OBJ_RELEASE(alert);
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
/* pack our exit code */
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &error_code, 1, ORTE_EXIT_CODE))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
OBJ_RELEASE(alert);
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
/* flag that this job is complete so the receiver can know */
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
OBJ_RELEASE(alert);
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* send it */
|
||||||
|
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
|
||||||
|
ORTE_PROC_MY_HNP, alert,
|
||||||
|
ORTE_RML_TAG_PLM,
|
||||||
|
orte_rml_send_callback, NULL))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
OBJ_RELEASE(alert);
|
||||||
|
/* we can't communicate, so give up */
|
||||||
|
orte_quit(0, 0, NULL);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
cleanup:
|
||||||
|
/* set a timer for exiting - this also gives the message a chance
|
||||||
|
* to get out! */
|
||||||
|
if (NULL == (timer = OBJ_NEW(orte_timer_t))) {
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
timer->tv.tv_sec = 5;
|
||||||
|
timer->tv.tv_usec = 0;
|
||||||
|
opal_event_evtimer_set(orte_event_base, timer->ev, wakeup, NULL);
|
||||||
|
opal_event_set_priority(timer->ev, ORTE_ERROR_PRI);
|
||||||
|
opal_event_evtimer_add(timer->ev, &timer->tv);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
static void job_errors(int fd, short args, void *cbdata)
|
static void job_errors(int fd, short args, void *cbdata)
|
||||||
{
|
{
|
||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||||
@ -259,7 +372,7 @@ static void proc_errors(int fd, short args, void *cbdata)
|
|||||||
/* terminate - our routed children will see
|
/* terminate - our routed children will see
|
||||||
* us leave and automatically die
|
* us leave and automatically die
|
||||||
*/
|
*/
|
||||||
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
orte_quit(0, 0, NULL);
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -528,7 +528,8 @@ static void xcast_recv(int status, orte_process_name_t* sender,
|
|||||||
OBJ_RELEASE(item);
|
OBJ_RELEASE(item);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (ORTE_PROC_STATE_RUNNING < rec->state ||
|
if ((ORTE_PROC_STATE_RUNNING < rec->state &&
|
||||||
|
ORTE_PROC_STATE_CALLED_ABORT != rec->state) ||
|
||||||
!ORTE_FLAG_TEST(rec, ORTE_PROC_FLAG_ALIVE)) {
|
!ORTE_FLAG_TEST(rec, ORTE_PROC_FLAG_ALIVE)) {
|
||||||
opal_output(0, "%s grpcomm:direct:send_relay proc %s not running - cannot relay",
|
opal_output(0, "%s grpcomm:direct:send_relay proc %s not running - cannot relay",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&nm->name));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&nm->name));
|
||||||
|
@ -48,6 +48,7 @@
|
|||||||
#include "opal/class/opal_list.h"
|
#include "opal/class/opal_list.h"
|
||||||
#include "opal/mca/event/event.h"
|
#include "opal/mca/event/event.h"
|
||||||
|
|
||||||
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/mca/plm/plm_types.h"
|
#include "orte/mca/plm/plm_types.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
|
|
||||||
@ -64,42 +65,40 @@ ORTE_DECLSPEC extern mca_base_framework_t orte_state_base_framework;
|
|||||||
/* For ease in debugging the state machine, it is STRONGLY recommended
|
/* For ease in debugging the state machine, it is STRONGLY recommended
|
||||||
* that the functions be accessed using the following macros
|
* that the functions be accessed using the following macros
|
||||||
*/
|
*/
|
||||||
/* Report an unrecoverable internal error.
 *
 * Unless an abnormal termination has already been ordered, passes
 * the error code (x) to the active errmgr module's abort function
 * together with a message that names this process, the source
 * location of the invocation, and the error name/code.
 *
 * The variadic arguments must follow the order of the conversions
 * in the format string: process name, file, line, error name, code.
 *
 * NOTE: (x) is expanded more than once, so pass an expression
 * without side effects.
 * NOTE: the ';' after while(0) is kept so that existing call sites
 * are unaffected; because of it the macro cannot be the unbraced
 * body of an "if" that is followed by an "else".
 */
#define ORTE_FORCED_TERMINATE(x)                                                 \
    do {                                                                         \
        if (!orte_abnormal_term_ordered) {                                       \
            orte_errmgr.abort((x), "%s FORCE-TERMINATE AT %s:%d - error %s(%d)", \
                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),                \
                              __FILE__, __LINE__,                                \
                              ORTE_ERROR_NAME((x)), (x));                        \
        }                                                                        \
    } while(0);
|
||||||
|
|
||||||
/* Activate state (s) for job (j) through the state framework.
 *
 * First emits a trace through opal_output_verbose (level 1, on the
 * state framework's output) naming this process, the job id, the
 * state, and the file/line of the invocation; then calls
 * orte_state.activate_job_state.
 *
 * (j) is evaluated once and held in a local named "shadow"; a NULL
 * job is accepted and traced as "NULL". (s) is expanded twice.
 */
#define ORTE_ACTIVATE_JOB_STATE(j, s) \
|
#define ORTE_ACTIVATE_JOB_STATE(j, s) \
|
||||||
do { \
|
do { \
|
||||||
orte_job_t *shadow=(j); \
|
orte_job_t *shadow=(j); \
|
||||||
opal_output_verbose(1, orte_state_base_framework.framework_output, \
|
opal_output_verbose(1, orte_state_base_framework.framework_output, \
|
||||||
"%s ACTIVATE JOB %s STATE %s AT %s:%d", \
|
"%s ACTIVATE JOB %s STATE %s AT %s:%d", \
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
|
||||||
(NULL == shadow) ? "NULL" : \
|
(NULL == shadow) ? "NULL" : \
|
||||||
ORTE_JOBID_PRINT(shadow->jobid), \
|
ORTE_JOBID_PRINT(shadow->jobid), \
|
||||||
orte_job_state_to_str((s)), \
|
orte_job_state_to_str((s)), \
|
||||||
__FILE__, __LINE__); \
|
__FILE__, __LINE__); \
|
||||||
orte_state.activate_job_state(shadow, (s)); \
|
orte_state.activate_job_state(shadow, (s)); \
|
||||||
} while(0);
|
} while(0);
|
||||||
|
|
||||||
/* Activate state (s) for process (p) through the state framework.
 *
 * First emits a trace through opal_output_verbose (level 1, on the
 * state framework's output) naming this process, the target
 * process, the state, and the file/line of the invocation; then
 * calls orte_state.activate_proc_state.
 *
 * (p) is evaluated once and held in a local named "shadow"; a NULL
 * name is accepted and traced as "NULL". (s) is expanded twice.
 */
#define ORTE_ACTIVATE_PROC_STATE(p, s) \
|
#define ORTE_ACTIVATE_PROC_STATE(p, s) \
|
||||||
do { \
|
do { \
|
||||||
orte_process_name_t *shadow=(p); \
|
orte_process_name_t *shadow=(p); \
|
||||||
opal_output_verbose(1, orte_state_base_framework.framework_output, \
|
opal_output_verbose(1, orte_state_base_framework.framework_output, \
|
||||||
"%s ACTIVATE PROC %s STATE %s AT %s:%d", \
|
"%s ACTIVATE PROC %s STATE %s AT %s:%d", \
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
|
||||||
(NULL == shadow) ? "NULL" : \
|
(NULL == shadow) ? "NULL" : \
|
||||||
ORTE_NAME_PRINT(shadow), \
|
ORTE_NAME_PRINT(shadow), \
|
||||||
orte_proc_state_to_str((s)), \
|
orte_proc_state_to_str((s)), \
|
||||||
__FILE__, __LINE__); \
|
__FILE__, __LINE__); \
|
||||||
orte_state.activate_proc_state(shadow, (s)); \
|
orte_state.activate_proc_state(shadow, (s)); \
|
||||||
} while(0);
|
} while(0);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user