Merge pull request #1423 from rhc54/topic/suicide
Fix registration of error handlers thru the pmix120 component.
Этот коммит содержится в:
Коммит
d38e2e6655
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
|
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -46,6 +46,15 @@ OPAL_DECLSPEC void opal_pmix_base_errhandler(int status,
|
|||||||
OPAL_DECLSPEC int opal_pmix_base_exchange(opal_value_t *info,
|
OPAL_DECLSPEC int opal_pmix_base_exchange(opal_value_t *info,
|
||||||
opal_pmix_pdata_t *pdat,
|
opal_pmix_pdata_t *pdat,
|
||||||
int timeout);
|
int timeout);
|
||||||
|
|
||||||
|
OPAL_DECLSPEC void opal_pmix_base_set_evbase(opal_event_base_t *evbase);
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
opal_event_base_t *evbase;
|
||||||
|
} opal_pmix_base_t;
|
||||||
|
|
||||||
|
extern opal_pmix_base_t opal_pmix_base;
|
||||||
|
|
||||||
END_C_DECLS
|
END_C_DECLS
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -38,6 +38,11 @@
|
|||||||
|
|
||||||
#define OPAL_PMI_PAD 10
|
#define OPAL_PMI_PAD 10
|
||||||
|
|
||||||
|
void opal_pmix_base_set_evbase(opal_event_base_t *evbase)
|
||||||
|
{
|
||||||
|
opal_pmix_base.evbase = evbase;
|
||||||
|
}
|
||||||
|
|
||||||
/******** ERRHANDLER SUPPORT FOR COMPONENTS THAT
|
/******** ERRHANDLER SUPPORT FOR COMPONENTS THAT
|
||||||
******** DO NOT NATIVELY SUPPORT IT
|
******** DO NOT NATIVELY SUPPORT IT
|
||||||
********/
|
********/
|
||||||
|
@ -33,9 +33,9 @@
|
|||||||
https://github.com/open-mpi/ompi/issues/375 for details. */
|
https://github.com/open-mpi/ompi/issues/375 for details. */
|
||||||
opal_pmix_base_module_t opal_pmix = { 0 };
|
opal_pmix_base_module_t opal_pmix = { 0 };
|
||||||
bool opal_pmix_collect_all_data = true;
|
bool opal_pmix_collect_all_data = true;
|
||||||
bool opal_pmix_base_allow_delayed_server = false;
|
|
||||||
int opal_pmix_verbose_output = -1;
|
int opal_pmix_verbose_output = -1;
|
||||||
bool opal_pmix_base_async_modex = false;
|
bool opal_pmix_base_async_modex = false;
|
||||||
|
opal_pmix_base_t opal_pmix_base = {0};
|
||||||
|
|
||||||
static int opal_pmix_base_frame_register(mca_base_register_flag_t flags)
|
static int opal_pmix_base_frame_register(mca_base_register_flag_t flags)
|
||||||
{
|
{
|
||||||
|
@ -212,9 +212,9 @@ static void reg_thread(int sd, short args, void *cbdata)
|
|||||||
opal_pmix120_etracker_t *trk;
|
opal_pmix120_etracker_t *trk;
|
||||||
|
|
||||||
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
|
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
|
||||||
"%s register complete with status %d",
|
"%s register complete with status %d ref %d",
|
||||||
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
|
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
|
||||||
cd->status);
|
cd->status, cd->errhandler_ref);
|
||||||
|
|
||||||
/* convert the status */
|
/* convert the status */
|
||||||
rc = pmix120_convert_rc(cd->status);
|
rc = pmix120_convert_rc(cd->status);
|
||||||
@ -240,9 +240,10 @@ static void reg_cbfunc(pmix_status_t status,
|
|||||||
void *cbdata)
|
void *cbdata)
|
||||||
{
|
{
|
||||||
pmix120_opcaddy_t *cd = (pmix120_opcaddy_t*)cbdata;
|
pmix120_opcaddy_t *cd = (pmix120_opcaddy_t*)cbdata;
|
||||||
|
|
||||||
cd->status = status;
|
cd->status = status;
|
||||||
cd->errhandler_ref = errhandler_ref;
|
cd->errhandler_ref = errhandler_ref;
|
||||||
opal_event_set(opal_sync_event_base, &cd->ev,
|
opal_event_set(opal_pmix_base.evbase, &cd->ev,
|
||||||
-1, OPAL_EV_WRITE, reg_thread, cd);
|
-1, OPAL_EV_WRITE, reg_thread, cd);
|
||||||
opal_event_set_priority(&cd->ev, OPAL_EV_MSG_HI_PRI);
|
opal_event_set_priority(&cd->ev, OPAL_EV_MSG_HI_PRI);
|
||||||
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);
|
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);
|
||||||
|
@ -28,6 +28,7 @@
|
|||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
#include "opal/dss/dss.h"
|
#include "opal/dss/dss.h"
|
||||||
#include "opal/errhandler/opal_errhandler.h"
|
#include "opal/errhandler/opal_errhandler.h"
|
||||||
|
#include "opal/mca/pmix/pmix.h"
|
||||||
|
|
||||||
#include "orte/util/error_strings.h"
|
#include "orte/util/error_strings.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
@ -71,6 +72,33 @@
|
|||||||
static void proc_errors(int fd, short args, void *cbdata);
|
static void proc_errors(int fd, short args, void *cbdata);
|
||||||
static void pmix_error(int error, opal_proc_t *proc, void *cbdata)
|
static void pmix_error(int error, opal_proc_t *proc, void *cbdata)
|
||||||
{
|
{
|
||||||
|
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
|
||||||
|
"%s errmgr:default_app: errhandler called",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
|
|
||||||
|
/* push it into our event base */
|
||||||
|
ORTE_ACTIVATE_PROC_STATE(ORTE_PROC_MY_NAME, ORTE_PROC_STATE_COMM_FAILED);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int myerrhandle = -1;
|
||||||
|
|
||||||
|
static void register_cbfunc(int status, int errhndler, void *cbdata)
|
||||||
|
{
|
||||||
|
myerrhandle = errhndler;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void notify_cbfunc(int status,
|
||||||
|
opal_list_t *procs,
|
||||||
|
opal_list_t *info,
|
||||||
|
opal_pmix_release_cbfunc_t cbfunc,
|
||||||
|
void *cbdata)
|
||||||
|
{
|
||||||
|
if (NULL != cbfunc) {
|
||||||
|
cbfunc(cbdata);
|
||||||
|
}
|
||||||
|
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
|
||||||
|
"%s errmgr:default_app: pmix errhandler called",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
/* push it into our event base */
|
/* push it into our event base */
|
||||||
ORTE_ACTIVATE_PROC_STATE(ORTE_PROC_MY_NAME, ORTE_PROC_STATE_COMM_FAILED);
|
ORTE_ACTIVATE_PROC_STATE(ORTE_PROC_MY_NAME, ORTE_PROC_STATE_COMM_FAILED);
|
||||||
}
|
}
|
||||||
@ -86,13 +114,16 @@ static void pmix_error(int error, opal_proc_t *proc, void *cbdata)
|
|||||||
/* register an errhandler */
|
/* register an errhandler */
|
||||||
opal_register_errhandler(pmix_error, NULL);
|
opal_register_errhandler(pmix_error, NULL);
|
||||||
|
|
||||||
|
/* tie the default PMIx errhandler back to us */
|
||||||
|
opal_pmix.register_errhandler(NULL, notify_cbfunc, register_cbfunc, NULL);
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int finalize(void)
|
static int finalize(void)
|
||||||
{
|
{
|
||||||
opal_deregister_errhandler();
|
opal_deregister_errhandler();
|
||||||
|
opal_pmix.deregister_errhandler(myerrhandle, NULL, NULL);
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
* Copyright (c) 2010-2012 Oak Ridge National Labs. All rights reserved.
|
* Copyright (c) 2010-2012 Oak Ridge National Labs. All rights reserved.
|
||||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
|
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
|
||||||
* Copyright (c) 2014 Research Organization for Information Science
|
* Copyright (c) 2014 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
|
||||||
@ -46,7 +46,6 @@
|
|||||||
#include "opal/util/proc.h"
|
#include "opal/util/proc.h"
|
||||||
#include "opal/runtime/opal.h"
|
#include "opal/runtime/opal.h"
|
||||||
#include "opal/runtime/opal_cr.h"
|
#include "opal/runtime/opal_cr.h"
|
||||||
#include "opal/runtime/opal_progress_threads.h"
|
|
||||||
|
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/mca/dfs/base/base.h"
|
#include "orte/mca/dfs/base/base.h"
|
||||||
@ -70,8 +69,6 @@
|
|||||||
|
|
||||||
#include "orte/mca/ess/base/base.h"
|
#include "orte/mca/ess/base/base.h"
|
||||||
|
|
||||||
static bool progress_thread_running = false;
|
|
||||||
|
|
||||||
int orte_ess_base_app_setup(bool db_restrict_local)
|
int orte_ess_base_app_setup(bool db_restrict_local)
|
||||||
{
|
{
|
||||||
int ret;
|
int ret;
|
||||||
@ -109,10 +106,6 @@ int orte_ess_base_app_setup(bool db_restrict_local)
|
|||||||
opal_proc_local_set(&orte_process_info.super);
|
opal_proc_local_set(&orte_process_info.super);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* get an async event base - we use the opal_async one so
|
|
||||||
* we don't startup extra threads if not needed */
|
|
||||||
orte_event_base = opal_progress_thread_init(NULL);
|
|
||||||
progress_thread_running = true;
|
|
||||||
/* open and setup the state machine */
|
/* open and setup the state machine */
|
||||||
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
|
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
@ -235,12 +228,6 @@ int orte_ess_base_app_setup(bool db_restrict_local)
|
|||||||
}
|
}
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
error:
|
error:
|
||||||
if (!progress_thread_running) {
|
|
||||||
/* can't send the help message, so ensure it
|
|
||||||
* comes out locally
|
|
||||||
*/
|
|
||||||
orte_show_help_finalize();
|
|
||||||
}
|
|
||||||
orte_show_help("help-orte-runtime.txt",
|
orte_show_help("help-orte-runtime.txt",
|
||||||
"orte_init:startup:internal-failure",
|
"orte_init:startup:internal-failure",
|
||||||
true, error, ORTE_ERROR_NAME(ret), ret);
|
true, error, ORTE_ERROR_NAME(ret), ret);
|
||||||
@ -265,12 +252,6 @@ int orte_ess_base_app_finalize(void)
|
|||||||
|
|
||||||
orte_session_dir_finalize(ORTE_PROC_MY_NAME);
|
orte_session_dir_finalize(ORTE_PROC_MY_NAME);
|
||||||
|
|
||||||
/* release the event base */
|
|
||||||
if (progress_thread_running) {
|
|
||||||
opal_progress_thread_finalize(NULL);
|
|
||||||
progress_thread_running = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -522,6 +522,8 @@ int orte_ess_base_orted_setup(char **hosts)
|
|||||||
error = "opal_pmix_base_select";
|
error = "opal_pmix_base_select";
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
/* set the event base */
|
||||||
|
opal_pmix_base_set_evbase(orte_event_base);
|
||||||
/* setup the PMIx server */
|
/* setup the PMIx server */
|
||||||
if (ORTE_SUCCESS != (ret = pmix_server_init())) {
|
if (ORTE_SUCCESS != (ret = pmix_server_init())) {
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
|
@ -630,6 +630,8 @@ static int rte_init(void)
|
|||||||
error = "opal_pmix_base_select";
|
error = "opal_pmix_base_select";
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
/* set the event base */
|
||||||
|
opal_pmix_base_set_evbase(orte_event_base);
|
||||||
|
|
||||||
/* setup the routed info - the selected routed component
|
/* setup the routed info - the selected routed component
|
||||||
* will know what to do.
|
* will know what to do.
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
* Copyright (c) 2008-2012 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2008-2012 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
|
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
|
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -39,6 +39,7 @@
|
|||||||
#include "opal/util/opal_environ.h"
|
#include "opal/util/opal_environ.h"
|
||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
#include "opal/util/argv.h"
|
#include "opal/util/argv.h"
|
||||||
|
#include "opal/runtime/opal_progress_threads.h"
|
||||||
#include "opal/class/opal_pointer_array.h"
|
#include "opal/class/opal_pointer_array.h"
|
||||||
#include "opal/mca/hwloc/base/base.h"
|
#include "opal/mca/hwloc/base/base.h"
|
||||||
#include "opal/util/printf.h"
|
#include "opal/util/printf.h"
|
||||||
@ -73,6 +74,7 @@ orte_ess_base_module_t orte_ess_pmi_module = {
|
|||||||
static bool added_transport_keys=false;
|
static bool added_transport_keys=false;
|
||||||
static bool added_num_procs = false;
|
static bool added_num_procs = false;
|
||||||
static bool added_app_ctx = false;
|
static bool added_app_ctx = false;
|
||||||
|
static bool progress_thread_running = false;
|
||||||
|
|
||||||
/**** MODULE FUNCTIONS ****/
|
/**** MODULE FUNCTIONS ****/
|
||||||
|
|
||||||
@ -97,6 +99,11 @@ static int rte_init(void)
|
|||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* get an async event base - we use the opal_async one so
|
||||||
|
* we don't startup extra threads if not needed */
|
||||||
|
orte_event_base = opal_progress_thread_init(NULL);
|
||||||
|
progress_thread_running = true;
|
||||||
|
|
||||||
/* open and setup pmix */
|
/* open and setup pmix */
|
||||||
if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) {
|
if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) {
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
@ -109,6 +116,8 @@ static int rte_init(void)
|
|||||||
error = "pmix init";
|
error = "pmix init";
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
/* set the event base */
|
||||||
|
opal_pmix_base_set_evbase(orte_event_base);
|
||||||
/* initialize the selected module */
|
/* initialize the selected module */
|
||||||
if (!opal_pmix.initialized() && (OPAL_SUCCESS != (ret = opal_pmix.init()))) {
|
if (!opal_pmix.initialized() && (OPAL_SUCCESS != (ret = opal_pmix.init()))) {
|
||||||
/* we cannot run */
|
/* we cannot run */
|
||||||
@ -394,6 +403,12 @@ static int rte_init(void)
|
|||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
|
|
||||||
error:
|
error:
|
||||||
|
if (!progress_thread_running) {
|
||||||
|
/* can't send the help message, so ensure it
|
||||||
|
* comes out locally
|
||||||
|
*/
|
||||||
|
orte_show_help_finalize();
|
||||||
|
}
|
||||||
if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
|
if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
|
||||||
orte_show_help("help-orte-runtime.txt",
|
orte_show_help("help-orte-runtime.txt",
|
||||||
"orte_init:startup:internal-failure",
|
"orte_init:startup:internal-failure",
|
||||||
@ -419,18 +434,23 @@ static int rte_finalize(void)
|
|||||||
unsetenv("OMPI_APP_CTX_NUM_PROCS");
|
unsetenv("OMPI_APP_CTX_NUM_PROCS");
|
||||||
}
|
}
|
||||||
|
|
||||||
/* mark us as finalized */
|
|
||||||
if (NULL != opal_pmix.finalize) {
|
|
||||||
opal_pmix.finalize();
|
|
||||||
(void) mca_base_framework_close(&opal_pmix_base_framework);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* use the default app procedure to finish */
|
/* use the default app procedure to finish */
|
||||||
if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) {
|
if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) {
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* mark us as finalized */
|
||||||
|
if (NULL != opal_pmix.finalize) {
|
||||||
|
opal_pmix.finalize();
|
||||||
|
(void) mca_base_framework_close(&opal_pmix_base_framework);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* release the event base */
|
||||||
|
if (progress_thread_running) {
|
||||||
|
opal_progress_thread_finalize(NULL);
|
||||||
|
progress_thread_running = false;
|
||||||
|
}
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -415,14 +415,16 @@ static int do_child(orte_app_context_t* context,
|
|||||||
always outputs a nice, single message indicating what
|
always outputs a nice, single message indicating what
|
||||||
happened
|
happened
|
||||||
*/
|
*/
|
||||||
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&opts,
|
if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
|
||||||
&environ_copy))) {
|
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&opts,
|
||||||
ORTE_ERROR_LOG(i);
|
&environ_copy))) {
|
||||||
send_error_show_help(write_fd, 1,
|
ORTE_ERROR_LOG(i);
|
||||||
"help-orte-odls-default.txt",
|
send_error_show_help(write_fd, 1,
|
||||||
"iof setup failed",
|
"help-orte-odls-default.txt",
|
||||||
orte_process_info.nodename, context->app);
|
"iof setup failed",
|
||||||
/* Does not return */
|
orte_process_info.nodename, context->app);
|
||||||
|
/* Does not return */
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* now set any child-level controls such as binding */
|
/* now set any child-level controls such as binding */
|
||||||
|
@ -1282,18 +1282,9 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
|
|||||||
opal_argv_append(argc, argv, "1");
|
opal_argv_append(argc, argv, "1");
|
||||||
}
|
}
|
||||||
|
|
||||||
/* the following two are not mca params */
|
/* the following is not an mca param */
|
||||||
if ((int)ORTE_VPID_INVALID != orted_debug_failure) {
|
if (NULL != getenv("ORTE_TEST_ORTED_SUICIDE")) {
|
||||||
opal_argv_append(argc, argv, "--debug-failure");
|
opal_argv_append(argc, argv, "--test-suicide");
|
||||||
asprintf(¶m, "%d", orted_debug_failure);
|
|
||||||
opal_argv_append(argc, argv, param);
|
|
||||||
free(param);
|
|
||||||
}
|
|
||||||
if (0 < orted_debug_failure_delay) {
|
|
||||||
opal_argv_append(argc, argv, "--debug-failure-delay");
|
|
||||||
asprintf(¶m, "%d", orted_debug_failure_delay);
|
|
||||||
opal_argv_append(argc, argv, param);
|
|
||||||
free(param);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* tell the orted what ESS component to use */
|
/* tell the orted what ESS component to use */
|
||||||
|
@ -122,12 +122,11 @@ static struct {
|
|||||||
char* num_procs;
|
char* num_procs;
|
||||||
int uri_pipe;
|
int uri_pipe;
|
||||||
int singleton_died_pipe;
|
int singleton_died_pipe;
|
||||||
int fail;
|
|
||||||
int fail_delay;
|
|
||||||
bool abort;
|
bool abort;
|
||||||
bool mapreduce;
|
bool mapreduce;
|
||||||
bool tree_spawn;
|
bool tree_spawn;
|
||||||
char *hnp_topo_sig;
|
char *hnp_topo_sig;
|
||||||
|
bool test_suicide;
|
||||||
} orted_globals;
|
} orted_globals;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -143,13 +142,9 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
|
|||||||
&orted_spin_flag, OPAL_CMD_LINE_TYPE_BOOL,
|
&orted_spin_flag, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
"Have the orted spin until we can connect a debugger to it" },
|
"Have the orted spin until we can connect a debugger to it" },
|
||||||
|
|
||||||
{ NULL, '\0', NULL, "debug-failure", 1,
|
{ NULL, '\0', NULL, "test-suicide", 1,
|
||||||
&orted_globals.fail, OPAL_CMD_LINE_TYPE_INT,
|
&orted_globals.test_suicide, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
"Have the specified orted fail after init for debugging purposes" },
|
"Suicide instead of clean abort after delay" },
|
||||||
|
|
||||||
{ NULL, '\0', NULL, "debug-failure-delay", 1,
|
|
||||||
&orted_globals.fail_delay, OPAL_CMD_LINE_TYPE_INT,
|
|
||||||
"Have the orted specified for failure delay for the provided number of seconds before failing" },
|
|
||||||
|
|
||||||
{ "orte_debug", 'd', NULL, "debug", 0,
|
{ "orte_debug", 'd', NULL, "debug", 0,
|
||||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
@ -246,8 +241,6 @@ int orte_daemon(int argc, char *argv[])
|
|||||||
memset(&orted_globals, 0, sizeof(orted_globals));
|
memset(&orted_globals, 0, sizeof(orted_globals));
|
||||||
/* initialize the singleton died pipe to an illegal value so we can detect it was set */
|
/* initialize the singleton died pipe to an illegal value so we can detect it was set */
|
||||||
orted_globals.singleton_died_pipe = -1;
|
orted_globals.singleton_died_pipe = -1;
|
||||||
/* init the failure orted vpid to an invalid value */
|
|
||||||
orted_globals.fail = ORTE_VPID_INVALID;
|
|
||||||
|
|
||||||
/* setup to check common command line options that just report and die */
|
/* setup to check common command line options that just report and die */
|
||||||
cmd_line = OBJ_NEW(opal_cmd_line_t);
|
cmd_line = OBJ_NEW(opal_cmd_line_t);
|
||||||
@ -428,23 +421,23 @@ int orte_daemon(int argc, char *argv[])
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((int)ORTE_VPID_INVALID != orted_globals.fail) {
|
if ((int)ORTE_VPID_INVALID != orted_debug_failure) {
|
||||||
orted_globals.abort=false;
|
orted_globals.abort=false;
|
||||||
/* some vpid was ordered to fail. The value can be positive
|
/* some vpid was ordered to fail. The value can be positive
|
||||||
* or negative, depending upon the desired method for failure,
|
* or negative, depending upon the desired method for failure,
|
||||||
* so need to check both here
|
* so need to check both here
|
||||||
*/
|
*/
|
||||||
if (0 > orted_globals.fail) {
|
if (0 > orted_debug_failure) {
|
||||||
orted_globals.fail = -1*orted_globals.fail;
|
orted_debug_failure = -1*orted_debug_failure;
|
||||||
orted_globals.abort = true;
|
orted_globals.abort = true;
|
||||||
}
|
}
|
||||||
/* are we the specified vpid? */
|
/* are we the specified vpid? */
|
||||||
if ((int)ORTE_PROC_MY_NAME->vpid == orted_globals.fail) {
|
if ((int)ORTE_PROC_MY_NAME->vpid == orted_debug_failure) {
|
||||||
/* if the user specified we delay, then setup a timer
|
/* if the user specified we delay, then setup a timer
|
||||||
* and have it kill us
|
* and have it kill us
|
||||||
*/
|
*/
|
||||||
if (0 < orted_globals.fail_delay) {
|
if (0 < orted_debug_failure_delay) {
|
||||||
ORTE_TIMER_EVENT(orted_globals.fail_delay, 0, shutdown_callback, ORTE_SYS_PRI);
|
ORTE_TIMER_EVENT(orted_debug_failure_delay, 0, shutdown_callback, ORTE_SYS_PRI);
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
opal_output(0, "%s is executing clean %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
opal_output(0, "%s is executing clean %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -901,11 +894,15 @@ static void shutdown_callback(int fd, short flags, void *arg)
|
|||||||
|
|
||||||
/* if we were ordered to abort, do so */
|
/* if we were ordered to abort, do so */
|
||||||
if (orted_globals.abort) {
|
if (orted_globals.abort) {
|
||||||
opal_output(0, "%s is executing clean abort", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
opal_output(0, "%s is executing %s abort", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
(orted_globals.test_suicide) ? "suicide" : "clean");
|
||||||
/* do -not- call finalize as this will send a message to the HNP
|
/* do -not- call finalize as this will send a message to the HNP
|
||||||
* indicating clean termination! Instead, just kill our
|
* indicating clean termination! Instead, just kill our
|
||||||
* local procs, forcibly cleanup the local session_dir tree, and abort
|
* local procs, forcibly cleanup the local session_dir tree, and abort
|
||||||
*/
|
*/
|
||||||
|
if (orted_globals.test_suicide) {
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
orte_odls.kill_local_procs(NULL);
|
orte_odls.kill_local_procs(NULL);
|
||||||
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
||||||
abort();
|
abort();
|
||||||
|
@ -1066,6 +1066,14 @@ int orterun(int argc, char *argv[])
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* check for suicide test directives */
|
||||||
|
if (NULL != getenv("ORTE_TEST_HNP_SUICIDE") ||
|
||||||
|
NULL != getenv("ORTE_TEST_ORTED_SUICIDE")) {
|
||||||
|
/* don't forward IO from this process so we can
|
||||||
|
* see any debug after daemon termination */
|
||||||
|
ORTE_FLAG_UNSET(jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT);
|
||||||
|
}
|
||||||
|
|
||||||
/* check for a job timeout specification, to be provided in seconds
|
/* check for a job timeout specification, to be provided in seconds
|
||||||
* as that is what MPICH used
|
* as that is what MPICH used
|
||||||
*/
|
*/
|
||||||
@ -2862,6 +2870,11 @@ void orte_timeout_wakeup(int sd, short args, void *cbdata)
|
|||||||
orte_show_help("help-orterun.txt", "orterun:timeout",
|
orte_show_help("help-orterun.txt", "orterun:timeout",
|
||||||
true, (NULL == tm) ? "NULL" : tm);
|
true, (NULL == tm) ? "NULL" : tm);
|
||||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||||
|
/* if we are testing HNP suicide, then just exit */
|
||||||
|
if (NULL != getenv("ORTE_TEST_HNP_SUICIDE")) {
|
||||||
|
opal_output(0, "HNP exiting w/o cleanup");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
/* abort the job */
|
/* abort the job */
|
||||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE);
|
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE);
|
||||||
/* set the global abnormal exit flag */
|
/* set the global abnormal exit flag */
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user