1
1

Merge pull request from rhc54/topic/suicide

Fix registration of error handlers thru the pmix120 component.
Этот коммит содержится в:
rhc54 2016-03-02 17:43:06 -08:00
родитель 5a85a039fa 4a55fba414
Коммит d38e2e6655
13 изменённых файлов: 125 добавлений и 71 удалений

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -46,6 +46,15 @@ OPAL_DECLSPEC void opal_pmix_base_errhandler(int status,
OPAL_DECLSPEC int opal_pmix_base_exchange(opal_value_t *info,
opal_pmix_pdata_t *pdat,
int timeout);
OPAL_DECLSPEC void opal_pmix_base_set_evbase(opal_event_base_t *evbase);
typedef struct {
opal_event_base_t *evbase;
} opal_pmix_base_t;
extern opal_pmix_base_t opal_pmix_base;
END_C_DECLS
#endif

Просмотреть файл

@ -38,6 +38,11 @@
#define OPAL_PMI_PAD 10
void opal_pmix_base_set_evbase(opal_event_base_t *evbase)
{
opal_pmix_base.evbase = evbase;
}
/******** ERRHANDLER SUPPORT FOR COMPONENTS THAT
******** DO NOT NATIVELY SUPPORT IT
********/

Просмотреть файл

@ -33,9 +33,9 @@
https://github.com/open-mpi/ompi/issues/375 for details. */
opal_pmix_base_module_t opal_pmix = { 0 };
bool opal_pmix_collect_all_data = true;
bool opal_pmix_base_allow_delayed_server = false;
int opal_pmix_verbose_output = -1;
bool opal_pmix_base_async_modex = false;
opal_pmix_base_t opal_pmix_base = {0};
static int opal_pmix_base_frame_register(mca_base_register_flag_t flags)
{

Просмотреть файл

@ -212,9 +212,9 @@ static void reg_thread(int sd, short args, void *cbdata)
opal_pmix120_etracker_t *trk;
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
"%s register complete with status %d",
"%s register complete with status %d ref %d",
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
cd->status);
cd->status, cd->errhandler_ref);
/* convert the status */
rc = pmix120_convert_rc(cd->status);
@ -240,9 +240,10 @@ static void reg_cbfunc(pmix_status_t status,
void *cbdata)
{
pmix120_opcaddy_t *cd = (pmix120_opcaddy_t*)cbdata;
cd->status = status;
cd->errhandler_ref = errhandler_ref;
opal_event_set(opal_sync_event_base, &cd->ev,
opal_event_set(opal_pmix_base.evbase, &cd->ev,
-1, OPAL_EV_WRITE, reg_thread, cd);
opal_event_set_priority(&cd->ev, OPAL_EV_MSG_HI_PRI);
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);

Просмотреть файл

@ -28,6 +28,7 @@
#include "opal/util/output.h"
#include "opal/dss/dss.h"
#include "opal/errhandler/opal_errhandler.h"
#include "opal/mca/pmix/pmix.h"
#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
@ -71,6 +72,33 @@
static void proc_errors(int fd, short args, void *cbdata);
static void pmix_error(int error, opal_proc_t *proc, void *cbdata)
{
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
"%s errmgr:default_app: errhandler called",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* push it into our event base */
ORTE_ACTIVATE_PROC_STATE(ORTE_PROC_MY_NAME, ORTE_PROC_STATE_COMM_FAILED);
}
static int myerrhandle = -1;
static void register_cbfunc(int status, int errhndler, void *cbdata)
{
myerrhandle = errhndler;
}
static void notify_cbfunc(int status,
opal_list_t *procs,
opal_list_t *info,
opal_pmix_release_cbfunc_t cbfunc,
void *cbdata)
{
if (NULL != cbfunc) {
cbfunc(cbdata);
}
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
"%s errmgr:default_app: pmix errhandler called",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* push it into our event base */
ORTE_ACTIVATE_PROC_STATE(ORTE_PROC_MY_NAME, ORTE_PROC_STATE_COMM_FAILED);
}
@ -86,13 +114,16 @@ static void pmix_error(int error, opal_proc_t *proc, void *cbdata)
/* register an errhandler */
opal_register_errhandler(pmix_error, NULL);
/* tie the default PMIx errhandler back to us */
opal_pmix.register_errhandler(NULL, notify_cbfunc, register_cbfunc, NULL);
return ORTE_SUCCESS;
}
static int finalize(void)
{
opal_deregister_errhandler();
opal_pmix.deregister_errhandler(myerrhandle, NULL, NULL);
return ORTE_SUCCESS;
}

Просмотреть файл

@ -12,7 +12,7 @@
* Copyright (c) 2010-2012 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
@ -46,7 +46,6 @@
#include "opal/util/proc.h"
#include "opal/runtime/opal.h"
#include "opal/runtime/opal_cr.h"
#include "opal/runtime/opal_progress_threads.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/dfs/base/base.h"
@ -70,8 +69,6 @@
#include "orte/mca/ess/base/base.h"
static bool progress_thread_running = false;
int orte_ess_base_app_setup(bool db_restrict_local)
{
int ret;
@ -109,10 +106,6 @@ int orte_ess_base_app_setup(bool db_restrict_local)
opal_proc_local_set(&orte_process_info.super);
}
/* get an async event base - we use the opal_async one so
* we don't startup extra threads if not needed */
orte_event_base = opal_progress_thread_init(NULL);
progress_thread_running = true;
/* open and setup the state machine */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
@ -235,12 +228,6 @@ int orte_ess_base_app_setup(bool db_restrict_local)
}
return ORTE_SUCCESS;
error:
if (!progress_thread_running) {
/* can't send the help message, so ensure it
* comes out locally
*/
orte_show_help_finalize();
}
orte_show_help("help-orte-runtime.txt",
"orte_init:startup:internal-failure",
true, error, ORTE_ERROR_NAME(ret), ret);
@ -265,12 +252,6 @@ int orte_ess_base_app_finalize(void)
orte_session_dir_finalize(ORTE_PROC_MY_NAME);
/* release the event base */
if (progress_thread_running) {
opal_progress_thread_finalize(NULL);
progress_thread_running = false;
}
return ORTE_SUCCESS;
}

Просмотреть файл

@ -522,6 +522,8 @@ int orte_ess_base_orted_setup(char **hosts)
error = "opal_pmix_base_select";
goto error;
}
/* set the event base */
opal_pmix_base_set_evbase(orte_event_base);
/* setup the PMIx server */
if (ORTE_SUCCESS != (ret = pmix_server_init())) {
ORTE_ERROR_LOG(ret);

Просмотреть файл

@ -630,6 +630,8 @@ static int rte_init(void)
error = "opal_pmix_base_select";
goto error;
}
/* set the event base */
opal_pmix_base_set_evbase(orte_event_base);
/* setup the routed info - the selected routed component
* will know what to do.

Просмотреть файл

@ -12,7 +12,7 @@
* Copyright (c) 2008-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -39,6 +39,7 @@
#include "opal/util/opal_environ.h"
#include "opal/util/output.h"
#include "opal/util/argv.h"
#include "opal/runtime/opal_progress_threads.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/util/printf.h"
@ -73,6 +74,7 @@ orte_ess_base_module_t orte_ess_pmi_module = {
static bool added_transport_keys=false;
static bool added_num_procs = false;
static bool added_app_ctx = false;
static bool progress_thread_running = false;
/**** MODULE FUNCTIONS ****/
@ -97,6 +99,11 @@ static int rte_init(void)
goto error;
}
/* get an async event base - we use the opal_async one so
* we don't startup extra threads if not needed */
orte_event_base = opal_progress_thread_init(NULL);
progress_thread_running = true;
/* open and setup pmix */
if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
@ -109,6 +116,8 @@ static int rte_init(void)
error = "pmix init";
goto error;
}
/* set the event base */
opal_pmix_base_set_evbase(orte_event_base);
/* initialize the selected module */
if (!opal_pmix.initialized() && (OPAL_SUCCESS != (ret = opal_pmix.init()))) {
/* we cannot run */
@ -394,6 +403,12 @@ static int rte_init(void)
return ORTE_SUCCESS;
error:
if (!progress_thread_running) {
/* can't send the help message, so ensure it
* comes out locally
*/
orte_show_help_finalize();
}
if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
orte_show_help("help-orte-runtime.txt",
"orte_init:startup:internal-failure",
@ -419,18 +434,23 @@ static int rte_finalize(void)
unsetenv("OMPI_APP_CTX_NUM_PROCS");
}
/* mark us as finalized */
if (NULL != opal_pmix.finalize) {
opal_pmix.finalize();
(void) mca_base_framework_close(&opal_pmix_base_framework);
}
/* use the default app procedure to finish */
if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) {
ORTE_ERROR_LOG(ret);
return ret;
}
/* mark us as finalized */
if (NULL != opal_pmix.finalize) {
opal_pmix.finalize();
(void) mca_base_framework_close(&opal_pmix_base_framework);
}
/* release the event base */
if (progress_thread_running) {
opal_progress_thread_finalize(NULL);
progress_thread_running = false;
}
return ORTE_SUCCESS;
}

Просмотреть файл

@ -415,14 +415,16 @@ static int do_child(orte_app_context_t* context,
always outputs a nice, single message indicating what
happened
*/
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&opts,
&environ_copy))) {
ORTE_ERROR_LOG(i);
send_error_show_help(write_fd, 1,
"help-orte-odls-default.txt",
"iof setup failed",
orte_process_info.nodename, context->app);
/* Does not return */
if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&opts,
&environ_copy))) {
ORTE_ERROR_LOG(i);
send_error_show_help(write_fd, 1,
"help-orte-odls-default.txt",
"iof setup failed",
orte_process_info.nodename, context->app);
/* Does not return */
}
}
/* now set any child-level controls such as binding */

Просмотреть файл

@ -1282,18 +1282,9 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
opal_argv_append(argc, argv, "1");
}
/* the following two are not mca params */
if ((int)ORTE_VPID_INVALID != orted_debug_failure) {
opal_argv_append(argc, argv, "--debug-failure");
asprintf(&param, "%d", orted_debug_failure);
opal_argv_append(argc, argv, param);
free(param);
}
if (0 < orted_debug_failure_delay) {
opal_argv_append(argc, argv, "--debug-failure-delay");
asprintf(&param, "%d", orted_debug_failure_delay);
opal_argv_append(argc, argv, param);
free(param);
/* the following is not an mca param */
if (NULL != getenv("ORTE_TEST_ORTED_SUICIDE")) {
opal_argv_append(argc, argv, "--test-suicide");
}
/* tell the orted what ESS component to use */

Просмотреть файл

@ -122,12 +122,11 @@ static struct {
char* num_procs;
int uri_pipe;
int singleton_died_pipe;
int fail;
int fail_delay;
bool abort;
bool mapreduce;
bool tree_spawn;
char *hnp_topo_sig;
bool test_suicide;
} orted_globals;
/*
@ -143,13 +142,9 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
&orted_spin_flag, OPAL_CMD_LINE_TYPE_BOOL,
"Have the orted spin until we can connect a debugger to it" },
{ NULL, '\0', NULL, "debug-failure", 1,
&orted_globals.fail, OPAL_CMD_LINE_TYPE_INT,
"Have the specified orted fail after init for debugging purposes" },
{ NULL, '\0', NULL, "debug-failure-delay", 1,
&orted_globals.fail_delay, OPAL_CMD_LINE_TYPE_INT,
"Have the orted specified for failure delay for the provided number of seconds before failing" },
{ NULL, '\0', NULL, "test-suicide", 1,
&orted_globals.test_suicide, OPAL_CMD_LINE_TYPE_BOOL,
"Suicide instead of clean abort after delay" },
{ "orte_debug", 'd', NULL, "debug", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
@ -246,8 +241,6 @@ int orte_daemon(int argc, char *argv[])
memset(&orted_globals, 0, sizeof(orted_globals));
/* initialize the singleton died pipe to an illegal value so we can detect it was set */
orted_globals.singleton_died_pipe = -1;
/* init the failure orted vpid to an invalid value */
orted_globals.fail = ORTE_VPID_INVALID;
/* setup to check common command line options that just report and die */
cmd_line = OBJ_NEW(opal_cmd_line_t);
@ -428,23 +421,23 @@ int orte_daemon(int argc, char *argv[])
}
}
if ((int)ORTE_VPID_INVALID != orted_globals.fail) {
if ((int)ORTE_VPID_INVALID != orted_debug_failure) {
orted_globals.abort=false;
/* some vpid was ordered to fail. The value can be positive
* or negative, depending upon the desired method for failure,
* so need to check both here
*/
if (0 > orted_globals.fail) {
orted_globals.fail = -1*orted_globals.fail;
if (0 > orted_debug_failure) {
orted_debug_failure = -1*orted_debug_failure;
orted_globals.abort = true;
}
/* are we the specified vpid? */
if ((int)ORTE_PROC_MY_NAME->vpid == orted_globals.fail) {
if ((int)ORTE_PROC_MY_NAME->vpid == orted_debug_failure) {
/* if the user specified we delay, then setup a timer
* and have it kill us
*/
if (0 < orted_globals.fail_delay) {
ORTE_TIMER_EVENT(orted_globals.fail_delay, 0, shutdown_callback, ORTE_SYS_PRI);
if (0 < orted_debug_failure_delay) {
ORTE_TIMER_EVENT(orted_debug_failure_delay, 0, shutdown_callback, ORTE_SYS_PRI);
} else {
opal_output(0, "%s is executing clean %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -901,11 +894,15 @@ static void shutdown_callback(int fd, short flags, void *arg)
/* if we were ordered to abort, do so */
if (orted_globals.abort) {
opal_output(0, "%s is executing clean abort", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
opal_output(0, "%s is executing %s abort", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(orted_globals.test_suicide) ? "suicide" : "clean");
/* do -not- call finalize as this will send a message to the HNP
* indicating clean termination! Instead, just kill our
* local procs, forcibly cleanup the local session_dir tree, and abort
*/
if (orted_globals.test_suicide) {
exit(1);
}
orte_odls.kill_local_procs(NULL);
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
abort();

Просмотреть файл

@ -1066,6 +1066,14 @@ int orterun(int argc, char *argv[])
}
}
/* check for suicide test directives */
if (NULL != getenv("ORTE_TEST_HNP_SUICIDE") ||
NULL != getenv("ORTE_TEST_ORTED_SUICIDE")) {
/* don't forward IO from this process so we can
* see any debug after daemon termination */
ORTE_FLAG_UNSET(jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT);
}
/* check for a job timeout specification, to be provided in seconds
* as that is what MPICH used
*/
@ -2862,6 +2870,11 @@ void orte_timeout_wakeup(int sd, short args, void *cbdata)
orte_show_help("help-orterun.txt", "orterun:timeout",
true, (NULL == tm) ? "NULL" : tm);
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
/* if we are testing HNP suicide, then just exit */
if (NULL != getenv("ORTE_TEST_HNP_SUICIDE")) {
opal_output(0, "HNP exiting w/o cleanup");
exit(1);
}
/* abort the job */
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE);
/* set the global abnormal exit flag */