Merge pull request #1423 from rhc54/topic/suicide
Fix registration of error handlers thru the pmix120 component.
This commit is contained in:
Commit
d38e2e6655
opal/mca/pmix
orte
mca
errmgr/default_app
ess
odls/default
plm/base
orted
tools/orterun
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -46,6 +46,15 @@ OPAL_DECLSPEC void opal_pmix_base_errhandler(int status,
|
||||
OPAL_DECLSPEC int opal_pmix_base_exchange(opal_value_t *info,
|
||||
opal_pmix_pdata_t *pdat,
|
||||
int timeout);
|
||||
|
||||
OPAL_DECLSPEC void opal_pmix_base_set_evbase(opal_event_base_t *evbase);
|
||||
|
||||
/* State shared across the PMIx base framework. A single instance,
 * opal_pmix_base, is declared extern right after this typedef. */
typedef struct {
|
||||
/* event base assigned through opal_pmix_base_set_evbase() */
opal_event_base_t *evbase;
|
||||
} opal_pmix_base_t;
|
||||
|
||||
extern opal_pmix_base_t opal_pmix_base;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
||||
|
@ -38,6 +38,11 @@
|
||||
|
||||
#define OPAL_PMI_PAD 10
|
||||
|
||||
/* Record the event base to be used by the PMIx base framework.
 *
 * The pointer is stored as-is in the global opal_pmix_base; it is not
 * checked or copied here. Code such as the pmix120 reg_cbfunc reads
 * opal_pmix_base.evbase back when it sets up its event. */
void opal_pmix_base_set_evbase(opal_event_base_t *evbase)
|
||||
{
|
||||
opal_pmix_base.evbase = evbase;
|
||||
}
|
||||
|
||||
/******** ERRHANDLER SUPPORT FOR COMPONENTS THAT
|
||||
******** DO NOT NATIVELY SUPPORT IT
|
||||
********/
|
||||
|
@ -33,9 +33,9 @@
|
||||
https://github.com/open-mpi/ompi/issues/375 for details. */
|
||||
opal_pmix_base_module_t opal_pmix = { 0 };
|
||||
bool opal_pmix_collect_all_data = true;
|
||||
bool opal_pmix_base_allow_delayed_server = false;
|
||||
int opal_pmix_verbose_output = -1;
|
||||
bool opal_pmix_base_async_modex = false;
|
||||
opal_pmix_base_t opal_pmix_base = {0};
|
||||
|
||||
static int opal_pmix_base_frame_register(mca_base_register_flag_t flags)
|
||||
{
|
||||
|
@ -212,9 +212,9 @@ static void reg_thread(int sd, short args, void *cbdata)
|
||||
opal_pmix120_etracker_t *trk;
|
||||
|
||||
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
|
||||
"%s register complete with status %d",
|
||||
"%s register complete with status %d ref %d",
|
||||
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
|
||||
cd->status);
|
||||
cd->status, cd->errhandler_ref);
|
||||
|
||||
/* convert the status */
|
||||
rc = pmix120_convert_rc(cd->status);
|
||||
@ -240,9 +240,10 @@ static void reg_cbfunc(pmix_status_t status,
|
||||
void *cbdata)
|
||||
{
|
||||
pmix120_opcaddy_t *cd = (pmix120_opcaddy_t*)cbdata;
|
||||
|
||||
cd->status = status;
|
||||
cd->errhandler_ref = errhandler_ref;
|
||||
opal_event_set(opal_sync_event_base, &cd->ev,
|
||||
opal_event_set(opal_pmix_base.evbase, &cd->ev,
|
||||
-1, OPAL_EV_WRITE, reg_thread, cd);
|
||||
opal_event_set_priority(&cd->ev, OPAL_EV_MSG_HI_PRI);
|
||||
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);
|
||||
|
@ -28,6 +28,7 @@
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/errhandler/opal_errhandler.h"
|
||||
#include "opal/mca/pmix/pmix.h"
|
||||
|
||||
#include "orte/util/error_strings.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
@ -71,6 +72,33 @@
|
||||
static void proc_errors(int fd, short args, void *cbdata);
|
||||
/* OPAL-level error handler, installed with opal_register_errhandler().
 *
 * Emits a verbosity-1 trace on the errmgr framework output and then
 * activates ORTE_PROC_STATE_COMM_FAILED for this process. The error,
 * proc and cbdata arguments are not examined. */
static void pmix_error(int error, opal_proc_t *proc, void *cbdata)
|
||||
{
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:default_app: errhandler called",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* push it into our event base */
ORTE_ACTIVATE_PROC_STATE(ORTE_PROC_MY_NAME, ORTE_PROC_STATE_COMM_FAILED);
|
||||
}
|
||||
|
||||
/* Reference of the PMIx errhandler we registered; stays -1 until
 * register_cbfunc() has been called. finalize() passes this value to
 * opal_pmix.deregister_errhandler(). */
static int myerrhandle = -1;
|
||||
|
||||
/* Completion callback handed to opal_pmix.register_errhandler():
 * remembers the errhandler reference it is given.
 *
 * NOTE(review): status is ignored, so myerrhandle is overwritten even
 * when registration failed - confirm what errhndler holds in that case.
 * cbdata is unused. */
static void register_cbfunc(int status, int errhndler, void *cbdata)
|
||||
{
|
||||
myerrhandle = errhndler;
|
||||
}
|
||||
|
||||
/* Notification callback handed to opal_pmix.register_errhandler().
 *
 * Calls the release callback (when one is supplied) with cbdata, then
 * emits a verbosity-1 trace and activates ORTE_PROC_STATE_COMM_FAILED
 * for this process. The status, procs and info arguments are not
 * inspected. */
static void notify_cbfunc(int status,
|
||||
opal_list_t *procs,
|
||||
opal_list_t *info,
|
||||
opal_pmix_release_cbfunc_t cbfunc,
|
||||
void *cbdata)
|
||||
{
|
||||
/* invoke the caller's release callback before doing our own work */
if (NULL != cbfunc) {
|
||||
cbfunc(cbdata);
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:default_app: pmix errhandler called",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
/* push it into our event base */
ORTE_ACTIVATE_PROC_STATE(ORTE_PROC_MY_NAME, ORTE_PROC_STATE_COMM_FAILED);
|
||||
}
|
||||
@ -86,13 +114,16 @@ static void pmix_error(int error, opal_proc_t *proc, void *cbdata)
|
||||
/* register an errhandler */
|
||||
opal_register_errhandler(pmix_error, NULL);
|
||||
|
||||
/* tie the default PMIx errhandler back to us */
|
||||
opal_pmix.register_errhandler(NULL, notify_cbfunc, register_cbfunc, NULL);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* Module finalize: removes the OPAL errhandler and then deregisters the
 * PMIx errhandler identified by myerrhandle. Always returns
 * ORTE_SUCCESS.
 *
 * NOTE(review): myerrhandle is still -1 if register_cbfunc() has not
 * run - confirm deregister_errhandler tolerates that value. */
static int finalize(void)
|
||||
{
|
||||
opal_deregister_errhandler();
|
||||
|
||||
/* no completion callback or cbdata is supplied for the deregistration */
opal_pmix.deregister_errhandler(myerrhandle, NULL, NULL);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -12,7 +12,7 @@
|
||||
* Copyright (c) 2010-2012 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
|
||||
@ -46,7 +46,6 @@
|
||||
#include "opal/util/proc.h"
|
||||
#include "opal/runtime/opal.h"
|
||||
#include "opal/runtime/opal_cr.h"
|
||||
#include "opal/runtime/opal_progress_threads.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/dfs/base/base.h"
|
||||
@ -70,8 +69,6 @@
|
||||
|
||||
#include "orte/mca/ess/base/base.h"
|
||||
|
||||
static bool progress_thread_running = false;
|
||||
|
||||
int orte_ess_base_app_setup(bool db_restrict_local)
|
||||
{
|
||||
int ret;
|
||||
@ -109,10 +106,6 @@ int orte_ess_base_app_setup(bool db_restrict_local)
|
||||
opal_proc_local_set(&orte_process_info.super);
|
||||
}
|
||||
|
||||
/* get an async event base - we use the opal_async one so
|
||||
* we don't startup extra threads if not needed */
|
||||
orte_event_base = opal_progress_thread_init(NULL);
|
||||
progress_thread_running = true;
|
||||
/* open and setup the state machine */
|
||||
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
@ -235,12 +228,6 @@ int orte_ess_base_app_setup(bool db_restrict_local)
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
error:
|
||||
if (!progress_thread_running) {
|
||||
/* can't send the help message, so ensure it
|
||||
* comes out locally
|
||||
*/
|
||||
orte_show_help_finalize();
|
||||
}
|
||||
orte_show_help("help-orte-runtime.txt",
|
||||
"orte_init:startup:internal-failure",
|
||||
true, error, ORTE_ERROR_NAME(ret), ret);
|
||||
@ -265,12 +252,6 @@ int orte_ess_base_app_finalize(void)
|
||||
|
||||
orte_session_dir_finalize(ORTE_PROC_MY_NAME);
|
||||
|
||||
/* release the event base */
|
||||
if (progress_thread_running) {
|
||||
opal_progress_thread_finalize(NULL);
|
||||
progress_thread_running = false;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -522,6 +522,8 @@ int orte_ess_base_orted_setup(char **hosts)
|
||||
error = "opal_pmix_base_select";
|
||||
goto error;
|
||||
}
|
||||
/* set the event base */
|
||||
opal_pmix_base_set_evbase(orte_event_base);
|
||||
/* setup the PMIx server */
|
||||
if (ORTE_SUCCESS != (ret = pmix_server_init())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
|
@ -630,6 +630,8 @@ static int rte_init(void)
|
||||
error = "opal_pmix_base_select";
|
||||
goto error;
|
||||
}
|
||||
/* set the event base */
|
||||
opal_pmix_base_set_evbase(orte_event_base);
|
||||
|
||||
/* setup the routed info - the selected routed component
|
||||
* will know what to do.
|
||||
|
@ -12,7 +12,7 @@
|
||||
* Copyright (c) 2008-2012 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -39,6 +39,7 @@
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/runtime/opal_progress_threads.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
#include "opal/mca/hwloc/base/base.h"
|
||||
#include "opal/util/printf.h"
|
||||
@ -73,6 +74,7 @@ orte_ess_base_module_t orte_ess_pmi_module = {
|
||||
static bool added_transport_keys=false;
|
||||
static bool added_num_procs = false;
|
||||
static bool added_app_ctx = false;
|
||||
static bool progress_thread_running = false;
|
||||
|
||||
/**** MODULE FUNCTIONS ****/
|
||||
|
||||
@ -97,6 +99,11 @@ static int rte_init(void)
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* get an async event base - we use the opal_async one so
|
||||
* we don't startup extra threads if not needed */
|
||||
orte_event_base = opal_progress_thread_init(NULL);
|
||||
progress_thread_running = true;
|
||||
|
||||
/* open and setup pmix */
|
||||
if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
@ -109,6 +116,8 @@ static int rte_init(void)
|
||||
error = "pmix init";
|
||||
goto error;
|
||||
}
|
||||
/* set the event base */
|
||||
opal_pmix_base_set_evbase(orte_event_base);
|
||||
/* initialize the selected module */
|
||||
if (!opal_pmix.initialized() && (OPAL_SUCCESS != (ret = opal_pmix.init()))) {
|
||||
/* we cannot run */
|
||||
@ -394,6 +403,12 @@ static int rte_init(void)
|
||||
return ORTE_SUCCESS;
|
||||
|
||||
error:
|
||||
if (!progress_thread_running) {
|
||||
/* can't send the help message, so ensure it
|
||||
* comes out locally
|
||||
*/
|
||||
orte_show_help_finalize();
|
||||
}
|
||||
if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
|
||||
orte_show_help("help-orte-runtime.txt",
|
||||
"orte_init:startup:internal-failure",
|
||||
@ -419,18 +434,23 @@ static int rte_finalize(void)
|
||||
unsetenv("OMPI_APP_CTX_NUM_PROCS");
|
||||
}
|
||||
|
||||
/* mark us as finalized */
|
||||
if (NULL != opal_pmix.finalize) {
|
||||
opal_pmix.finalize();
|
||||
(void) mca_base_framework_close(&opal_pmix_base_framework);
|
||||
}
|
||||
|
||||
/* use the default app procedure to finish */
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* mark us as finalized */
|
||||
if (NULL != opal_pmix.finalize) {
|
||||
opal_pmix.finalize();
|
||||
(void) mca_base_framework_close(&opal_pmix_base_framework);
|
||||
}
|
||||
|
||||
/* release the event base */
|
||||
if (progress_thread_running) {
|
||||
opal_progress_thread_finalize(NULL);
|
||||
progress_thread_running = false;
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -415,14 +415,16 @@ static int do_child(orte_app_context_t* context,
|
||||
always outputs a nice, single message indicating what
|
||||
happened
|
||||
*/
|
||||
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&opts,
|
||||
&environ_copy))) {
|
||||
ORTE_ERROR_LOG(i);
|
||||
send_error_show_help(write_fd, 1,
|
||||
"help-orte-odls-default.txt",
|
||||
"iof setup failed",
|
||||
orte_process_info.nodename, context->app);
|
||||
/* Does not return */
|
||||
if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
|
||||
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&opts,
|
||||
&environ_copy))) {
|
||||
ORTE_ERROR_LOG(i);
|
||||
send_error_show_help(write_fd, 1,
|
||||
"help-orte-odls-default.txt",
|
||||
"iof setup failed",
|
||||
orte_process_info.nodename, context->app);
|
||||
/* Does not return */
|
||||
}
|
||||
}
|
||||
|
||||
/* now set any child-level controls such as binding */
|
||||
|
@ -1282,18 +1282,9 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
|
||||
opal_argv_append(argc, argv, "1");
|
||||
}
|
||||
|
||||
/* the following two are not mca params */
|
||||
if ((int)ORTE_VPID_INVALID != orted_debug_failure) {
|
||||
opal_argv_append(argc, argv, "--debug-failure");
|
||||
asprintf(&param, "%d", orted_debug_failure);
|
||||
opal_argv_append(argc, argv, param);
|
||||
free(param);
|
||||
}
|
||||
if (0 < orted_debug_failure_delay) {
|
||||
opal_argv_append(argc, argv, "--debug-failure-delay");
|
||||
asprintf(&param, "%d", orted_debug_failure_delay);
|
||||
opal_argv_append(argc, argv, param);
|
||||
free(param);
|
||||
/* the following is not an mca param */
|
||||
if (NULL != getenv("ORTE_TEST_ORTED_SUICIDE")) {
|
||||
opal_argv_append(argc, argv, "--test-suicide");
|
||||
}
|
||||
|
||||
/* tell the orted what ESS component to use */
|
||||
|
@ -122,12 +122,11 @@ static struct {
|
||||
char* num_procs;
|
||||
int uri_pipe;
|
||||
int singleton_died_pipe;
|
||||
int fail;
|
||||
int fail_delay;
|
||||
bool abort;
|
||||
bool mapreduce;
|
||||
bool tree_spawn;
|
||||
char *hnp_topo_sig;
|
||||
bool test_suicide;
|
||||
} orted_globals;
|
||||
|
||||
/*
|
||||
@ -143,13 +142,9 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
|
||||
&orted_spin_flag, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Have the orted spin until we can connect a debugger to it" },
|
||||
|
||||
{ NULL, '\0', NULL, "debug-failure", 1,
|
||||
&orted_globals.fail, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Have the specified orted fail after init for debugging purposes" },
|
||||
|
||||
{ NULL, '\0', NULL, "debug-failure-delay", 1,
|
||||
&orted_globals.fail_delay, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Have the orted specified for failure delay for the provided number of seconds before failing" },
|
||||
{ NULL, '\0', NULL, "test-suicide", 1,
|
||||
&orted_globals.test_suicide, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Suicide instead of clean abort after delay" },
|
||||
|
||||
{ "orte_debug", 'd', NULL, "debug", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
@ -246,8 +241,6 @@ int orte_daemon(int argc, char *argv[])
|
||||
memset(&orted_globals, 0, sizeof(orted_globals));
|
||||
/* initialize the singleton died pipe to an illegal value so we can detect it was set */
|
||||
orted_globals.singleton_died_pipe = -1;
|
||||
/* init the failure orted vpid to an invalid value */
|
||||
orted_globals.fail = ORTE_VPID_INVALID;
|
||||
|
||||
/* setup to check common command line options that just report and die */
|
||||
cmd_line = OBJ_NEW(opal_cmd_line_t);
|
||||
@ -428,23 +421,23 @@ int orte_daemon(int argc, char *argv[])
|
||||
}
|
||||
}
|
||||
|
||||
if ((int)ORTE_VPID_INVALID != orted_globals.fail) {
|
||||
if ((int)ORTE_VPID_INVALID != orted_debug_failure) {
|
||||
orted_globals.abort=false;
|
||||
/* some vpid was ordered to fail. The value can be positive
|
||||
* or negative, depending upon the desired method for failure,
|
||||
* so need to check both here
|
||||
*/
|
||||
if (0 > orted_globals.fail) {
|
||||
orted_globals.fail = -1*orted_globals.fail;
|
||||
if (0 > orted_debug_failure) {
|
||||
orted_debug_failure = -1*orted_debug_failure;
|
||||
orted_globals.abort = true;
|
||||
}
|
||||
/* are we the specified vpid? */
|
||||
if ((int)ORTE_PROC_MY_NAME->vpid == orted_globals.fail) {
|
||||
if ((int)ORTE_PROC_MY_NAME->vpid == orted_debug_failure) {
|
||||
/* if the user specified we delay, then setup a timer
|
||||
* and have it kill us
|
||||
*/
|
||||
if (0 < orted_globals.fail_delay) {
|
||||
ORTE_TIMER_EVENT(orted_globals.fail_delay, 0, shutdown_callback, ORTE_SYS_PRI);
|
||||
if (0 < orted_debug_failure_delay) {
|
||||
ORTE_TIMER_EVENT(orted_debug_failure_delay, 0, shutdown_callback, ORTE_SYS_PRI);
|
||||
|
||||
} else {
|
||||
opal_output(0, "%s is executing clean %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -901,11 +894,15 @@ static void shutdown_callback(int fd, short flags, void *arg)
|
||||
|
||||
/* if we were ordered to abort, do so */
|
||||
if (orted_globals.abort) {
|
||||
opal_output(0, "%s is executing clean abort", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
opal_output(0, "%s is executing %s abort", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(orted_globals.test_suicide) ? "suicide" : "clean");
|
||||
/* do -not- call finalize as this will send a message to the HNP
|
||||
* indicating clean termination! Instead, just kill our
|
||||
* local procs, forcibly cleanup the local session_dir tree, and abort
|
||||
*/
|
||||
if (orted_globals.test_suicide) {
|
||||
exit(1);
|
||||
}
|
||||
orte_odls.kill_local_procs(NULL);
|
||||
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
||||
abort();
|
||||
|
@ -1066,6 +1066,14 @@ int orterun(int argc, char *argv[])
|
||||
}
|
||||
}
|
||||
|
||||
/* check for suicide test directives */
|
||||
if (NULL != getenv("ORTE_TEST_HNP_SUICIDE") ||
|
||||
NULL != getenv("ORTE_TEST_ORTED_SUICIDE")) {
|
||||
/* don't forward IO from this process so we can
|
||||
* see any debug after daemon termination */
|
||||
ORTE_FLAG_UNSET(jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT);
|
||||
}
|
||||
|
||||
/* check for a job timeout specification, to be provided in seconds
|
||||
* as that is what MPICH used
|
||||
*/
|
||||
@ -2862,6 +2870,11 @@ void orte_timeout_wakeup(int sd, short args, void *cbdata)
|
||||
orte_show_help("help-orterun.txt", "orterun:timeout",
|
||||
true, (NULL == tm) ? "NULL" : tm);
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
/* if we are testing HNP suicide, then just exit */
|
||||
if (NULL != getenv("ORTE_TEST_HNP_SUICIDE")) {
|
||||
opal_output(0, "HNP exiting w/o cleanup");
|
||||
exit(1);
|
||||
}
|
||||
/* abort the job */
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE);
|
||||
/* set the global abnormal exit flag */
|
||||
|
Loading…
Reference in new issue
Block a user