From 4a55fba4147af996813e8e1d8df62b64b94efca3 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Wed, 2 Mar 2016 14:30:57 -0800 Subject: [PATCH] Fix registration of error handlers thru the pmix120 component. A thread-shift operation was hanging on the sync_event_base, which made it dependent on someone calling opal_progress. Unfortunately, a process in "sleep" or spinning outside the MPI library won't do that, and so we never complete errhandler registration. --- opal/mca/pmix/base/base.h | 11 +++++- opal/mca/pmix/base/pmix_base_fns.c | 5 +++ opal/mca/pmix/base/pmix_base_frame.c | 2 +- opal/mca/pmix/pmix120/pmix_pmix120.c | 7 ++-- .../errmgr/default_app/errmgr_default_app.c | 33 +++++++++++++++++- orte/mca/ess/base/ess_base_std_app.c | 21 +----------- orte/mca/ess/base/ess_base_std_orted.c | 2 ++ orte/mca/ess/hnp/ess_hnp_module.c | 2 ++ orte/mca/ess/pmi/ess_pmi_module.c | 34 +++++++++++++++---- orte/mca/odls/default/odls_default_module.c | 18 +++++----- orte/mca/plm/base/plm_base_launch_support.c | 15 ++------ orte/orted/orted_main.c | 33 ++++++++---------- orte/tools/orterun/orterun.c | 13 +++++++ 13 files changed, 125 insertions(+), 71 deletions(-) diff --git a/opal/mca/pmix/base/base.h b/opal/mca/pmix/base/base.h index 1551f8eb4d..c1aec6e4e3 100644 --- a/opal/mca/pmix/base/base.h +++ b/opal/mca/pmix/base/base.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -46,6 +46,15 @@ OPAL_DECLSPEC void opal_pmix_base_errhandler(int status, OPAL_DECLSPEC int opal_pmix_base_exchange(opal_value_t *info, opal_pmix_pdata_t *pdat, int timeout); + +OPAL_DECLSPEC void opal_pmix_base_set_evbase(opal_event_base_t *evbase); + +typedef struct { + opal_event_base_t *evbase; +} opal_pmix_base_t; + +extern opal_pmix_base_t opal_pmix_base; + END_C_DECLS #endif diff --git a/opal/mca/pmix/base/pmix_base_fns.c b/opal/mca/pmix/base/pmix_base_fns.c index 9ffba7b33b..57e873385a 100644 --- a/opal/mca/pmix/base/pmix_base_fns.c +++ b/opal/mca/pmix/base/pmix_base_fns.c @@ -38,6 +38,11 @@ #define OPAL_PMI_PAD 10 +void opal_pmix_base_set_evbase(opal_event_base_t *evbase) +{ + opal_pmix_base.evbase = evbase; +} + /******** ERRHANDLER SUPPORT FOR COMPONENTS THAT ******** DO NOT NATIVELY SUPPORT IT ********/ diff --git a/opal/mca/pmix/base/pmix_base_frame.c b/opal/mca/pmix/base/pmix_base_frame.c index e75e378d4d..9eacc57176 100644 --- a/opal/mca/pmix/base/pmix_base_frame.c +++ b/opal/mca/pmix/base/pmix_base_frame.c @@ -33,9 +33,9 @@ https://github.com/open-mpi/ompi/issues/375 for details. */ opal_pmix_base_module_t opal_pmix = { 0 }; bool opal_pmix_collect_all_data = true; -bool opal_pmix_base_allow_delayed_server = false; int opal_pmix_verbose_output = -1; bool opal_pmix_base_async_modex = false; +opal_pmix_base_t opal_pmix_base = {0}; static int opal_pmix_base_frame_register(mca_base_register_flag_t flags) { diff --git a/opal/mca/pmix/pmix120/pmix_pmix120.c b/opal/mca/pmix/pmix120/pmix_pmix120.c index aad5125a26..7b37620250 100644 --- a/opal/mca/pmix/pmix120/pmix_pmix120.c +++ b/opal/mca/pmix/pmix120/pmix_pmix120.c @@ -212,9 +212,9 @@ static void reg_thread(int sd, short args, void *cbdata) opal_pmix120_etracker_t *trk; opal_output_verbose(2, opal_pmix_base_framework.framework_output, - "%s register complete with status %d", + "%s register complete with status %d ref %d", OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), - cd->status); + cd->status, cd->errhandler_ref); /* convert the status */ rc = pmix120_convert_rc(cd->status); @@ -240,9 +240,10 @@ static void reg_cbfunc(pmix_status_t status, void *cbdata) { pmix120_opcaddy_t *cd = (pmix120_opcaddy_t*)cbdata; + cd->status = status; cd->errhandler_ref = errhandler_ref; - opal_event_set(opal_sync_event_base, &cd->ev, + opal_event_set(opal_pmix_base.evbase, &cd->ev, -1, OPAL_EV_WRITE, reg_thread, cd); opal_event_set_priority(&cd->ev, OPAL_EV_MSG_HI_PRI); opal_event_active(&cd->ev, OPAL_EV_WRITE, 1); diff --git a/orte/mca/errmgr/default_app/errmgr_default_app.c b/orte/mca/errmgr/default_app/errmgr_default_app.c index 6f6a899cbd..c1ee58ee99 100644 --- a/orte/mca/errmgr/default_app/errmgr_default_app.c +++ b/orte/mca/errmgr/default_app/errmgr_default_app.c @@ -28,6 +28,7 @@ #include "opal/util/output.h" #include "opal/dss/dss.h" #include "opal/errhandler/opal_errhandler.h" +#include "opal/mca/pmix/pmix.h" #include "orte/util/error_strings.h" #include "orte/util/name_fns.h" @@ -71,6 +72,33 @@ static void proc_errors(int fd, short args, void *cbdata); static void pmix_error(int error, opal_proc_t *proc, void *cbdata) { + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output, + "%s errmgr:default_app: errhandler called", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + /* push it into our event base */ + ORTE_ACTIVATE_PROC_STATE(ORTE_PROC_MY_NAME, ORTE_PROC_STATE_COMM_FAILED); +} + +static int myerrhandle = -1; + +static void register_cbfunc(int status, int errhndler, void *cbdata) +{ + myerrhandle = errhndler; +} + +static void notify_cbfunc(int status, + opal_list_t *procs, + opal_list_t *info, + opal_pmix_release_cbfunc_t cbfunc, + void *cbdata) +{ + if (NULL != cbfunc) { + cbfunc(cbdata); + } + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output, + "%s errmgr:default_app: pmix errhandler called", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* push it into our event base */ ORTE_ACTIVATE_PROC_STATE(ORTE_PROC_MY_NAME, ORTE_PROC_STATE_COMM_FAILED); } @@ -86,13 +114,16 @@ static void pmix_error(int error, opal_proc_t *proc, void *cbdata) /* register an errhandler */ opal_register_errhandler(pmix_error, NULL); + /* tie the default PMIx errhandler back to us */ + opal_pmix.register_errhandler(NULL, notify_cbfunc, register_cbfunc, NULL); + return ORTE_SUCCESS; } static int finalize(void) { opal_deregister_errhandler(); - + opal_pmix.deregister_errhandler(myerrhandle, NULL, NULL); return ORTE_SUCCESS; } diff --git a/orte/mca/ess/base/ess_base_std_app.c b/orte/mca/ess/base/ess_base_std_app.c index ab3a09b864..f300456e4e 100644 --- a/orte/mca/ess/base/ess_base_std_app.c +++ b/orte/mca/ess/base/ess_base_std_app.c @@ -12,7 +12,7 @@ * Copyright (c) 2010-2012 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. @@ -46,7 +46,6 @@ #include "opal/util/proc.h" #include "opal/runtime/opal.h" #include "opal/runtime/opal_cr.h" -#include "opal/runtime/opal_progress_threads.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/dfs/base/base.h" @@ -70,8 +69,6 @@ #include "orte/mca/ess/base/base.h" -static bool progress_thread_running = false; - int orte_ess_base_app_setup(bool db_restrict_local) { int ret; @@ -109,10 +106,6 @@ int orte_ess_base_app_setup(bool db_restrict_local) opal_proc_local_set(&orte_process_info.super); } - /* get an async event base - we use the opal_async one so - * we don't startup extra threads if not needed */ - orte_event_base = opal_progress_thread_init(NULL); - progress_thread_running = true; /* open and setup the state machine */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -235,12 +228,6 @@ int orte_ess_base_app_setup(bool db_restrict_local) } return ORTE_SUCCESS; error: - if (!progress_thread_running) { - /* can't send the help message, so ensure it - * comes out locally - */ - orte_show_help_finalize(); - } orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); @@ -265,12 +252,6 @@ int orte_ess_base_app_finalize(void) orte_session_dir_finalize(ORTE_PROC_MY_NAME); - /* release the event base */ - if (progress_thread_running) { - opal_progress_thread_finalize(NULL); - progress_thread_running = false; - } - return ORTE_SUCCESS; } diff --git a/orte/mca/ess/base/ess_base_std_orted.c b/orte/mca/ess/base/ess_base_std_orted.c index 555692f6c6..deee99de3e 100644 --- a/orte/mca/ess/base/ess_base_std_orted.c +++ b/orte/mca/ess/base/ess_base_std_orted.c @@ -522,6 +522,8 @@ int orte_ess_base_orted_setup(char **hosts) error = "opal_pmix_base_select"; goto error; } + /* set the event base */ + opal_pmix_base_set_evbase(orte_event_base); /* setup the PMIx server */ if (ORTE_SUCCESS != (ret = pmix_server_init())) { ORTE_ERROR_LOG(ret); diff --git a/orte/mca/ess/hnp/ess_hnp_module.c b/orte/mca/ess/hnp/ess_hnp_module.c index 7b5d2c1067..6152103acc 100644 --- a/orte/mca/ess/hnp/ess_hnp_module.c +++ b/orte/mca/ess/hnp/ess_hnp_module.c @@ -630,6 +630,8 @@ static int rte_init(void) error = "opal_pmix_base_select"; goto error; } + /* set the event base */ + opal_pmix_base_set_evbase(orte_event_base); /* setup the routed info - the selected routed component * will know what to do. diff --git a/orte/mca/ess/pmi/ess_pmi_module.c b/orte/mca/ess/pmi/ess_pmi_module.c index 8ff5b96ba4..798e3462e5 100644 --- a/orte/mca/ess/pmi/ess_pmi_module.c +++ b/orte/mca/ess/pmi/ess_pmi_module.c @@ -12,7 +12,7 @@ * Copyright (c) 2008-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012-2013 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -39,6 +39,7 @@ #include "opal/util/opal_environ.h" #include "opal/util/output.h" #include "opal/util/argv.h" +#include "opal/runtime/opal_progress_threads.h" #include "opal/class/opal_pointer_array.h" #include "opal/mca/hwloc/base/base.h" #include "opal/util/printf.h" @@ -73,6 +74,7 @@ orte_ess_base_module_t orte_ess_pmi_module = { static bool added_transport_keys=false; static bool added_num_procs = false; static bool added_app_ctx = false; +static bool progress_thread_running = false; /**** MODULE FUNCTIONS ****/ @@ -97,6 +99,11 @@ static int rte_init(void) goto error; } + /* get an async event base - we use the opal_async one so + * we don't startup extra threads if not needed */ + orte_event_base = opal_progress_thread_init(NULL); + progress_thread_running = true; + /* open and setup pmix */ if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -109,6 +116,8 @@ static int rte_init(void) error = "pmix init"; goto error; } + /* set the event base */ + opal_pmix_base_set_evbase(orte_event_base); /* initialize the selected module */ if (!opal_pmix.initialized() && (OPAL_SUCCESS != (ret = opal_pmix.init()))) { /* we cannot run */ @@ -394,6 +403,12 @@ static int rte_init(void) return ORTE_SUCCESS; error: + if (!progress_thread_running) { + /* can't send the help message, so ensure it + * comes out locally + */ + orte_show_help_finalize(); + } if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) { orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", @@ -419,18 +434,23 @@ static int rte_finalize(void) unsetenv("OMPI_APP_CTX_NUM_PROCS"); } - /* mark us as finalized */ - if (NULL != opal_pmix.finalize) { - opal_pmix.finalize(); - (void) mca_base_framework_close(&opal_pmix_base_framework); - } - /* use the default app procedure to finish */ if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) { ORTE_ERROR_LOG(ret); return ret; } + /* mark us as finalized */ + if (NULL != opal_pmix.finalize) { + opal_pmix.finalize(); + (void) mca_base_framework_close(&opal_pmix_base_framework); + } + + /* release the event base */ + if (progress_thread_running) { + opal_progress_thread_finalize(NULL); + progress_thread_running = false; + } return ORTE_SUCCESS; } diff --git a/orte/mca/odls/default/odls_default_module.c b/orte/mca/odls/default/odls_default_module.c index 21af9f5bd3..c05f669464 100644 --- a/orte/mca/odls/default/odls_default_module.c +++ b/orte/mca/odls/default/odls_default_module.c @@ -415,14 +415,16 @@ static int do_child(orte_app_context_t* context, always outputs a nice, single message indicating what happened */ - if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&opts, - &environ_copy))) { - ORTE_ERROR_LOG(i); - send_error_show_help(write_fd, 1, - "help-orte-odls-default.txt", - "iof setup failed", - orte_process_info.nodename, context->app); - /* Does not return */ + if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) { + if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&opts, + &environ_copy))) { + ORTE_ERROR_LOG(i); + send_error_show_help(write_fd, 1, + "help-orte-odls-default.txt", + "iof setup failed", + orte_process_info.nodename, context->app); + /* Does not return */ + } } /* now set any child-level controls such as binding */ diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index 5241df29b0..b66583390f 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -1282,18 +1282,9 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv, opal_argv_append(argc, argv, "1"); } - /* the following two are not mca params */ - if ((int)ORTE_VPID_INVALID != orted_debug_failure) { - opal_argv_append(argc, argv, "--debug-failure"); - asprintf(¶m, "%d", orted_debug_failure); - opal_argv_append(argc, argv, param); - free(param); - } - if (0 < orted_debug_failure_delay) { - opal_argv_append(argc, argv, "--debug-failure-delay"); - asprintf(¶m, "%d", orted_debug_failure_delay); - opal_argv_append(argc, argv, param); - free(param); + /* the following is not an mca param */ + if (NULL != getenv("ORTE_TEST_ORTED_SUICIDE")) { + opal_argv_append(argc, argv, "--test-suicide"); } /* tell the orted what ESS component to use */ diff --git a/orte/orted/orted_main.c b/orte/orted/orted_main.c index a34e22489c..eef05637d7 100644 --- a/orte/orted/orted_main.c +++ b/orte/orted/orted_main.c @@ -122,12 +122,11 @@ static struct { char* num_procs; int uri_pipe; int singleton_died_pipe; - int fail; - int fail_delay; bool abort; bool mapreduce; bool tree_spawn; char *hnp_topo_sig; + bool test_suicide; } orted_globals; /* @@ -143,13 +142,9 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = { &orted_spin_flag, OPAL_CMD_LINE_TYPE_BOOL, "Have the orted spin until we can connect a debugger to it" }, - { NULL, '\0', NULL, "debug-failure", 1, - &orted_globals.fail, OPAL_CMD_LINE_TYPE_INT, - "Have the specified orted fail after init for debugging purposes" }, - - { NULL, '\0', NULL, "debug-failure-delay", 1, - &orted_globals.fail_delay, OPAL_CMD_LINE_TYPE_INT, - "Have the orted specified for failure delay for the provided number of seconds before failing" }, + { NULL, '\0', NULL, "test-suicide", 1, + &orted_globals.test_suicide, OPAL_CMD_LINE_TYPE_BOOL, + "Suicide instead of clean abort after delay" }, { "orte_debug", 'd', NULL, "debug", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, @@ -246,8 +241,6 @@ int orte_daemon(int argc, char *argv[]) memset(&orted_globals, 0, sizeof(orted_globals)); /* initialize the singleton died pipe to an illegal value so we can detect it was set */ orted_globals.singleton_died_pipe = -1; - /* init the failure orted vpid to an invalid value */ - orted_globals.fail = ORTE_VPID_INVALID; /* setup to check common command line options that just report and die */ cmd_line = OBJ_NEW(opal_cmd_line_t); @@ -428,23 +421,23 @@ int orte_daemon(int argc, char *argv[]) } } - if ((int)ORTE_VPID_INVALID != orted_globals.fail) { + if ((int)ORTE_VPID_INVALID != orted_debug_failure) { orted_globals.abort=false; /* some vpid was ordered to fail. The value can be positive * or negative, depending upon the desired method for failure, * so need to check both here */ - if (0 > orted_globals.fail) { - orted_globals.fail = -1*orted_globals.fail; + if (0 > orted_debug_failure) { + orted_debug_failure = -1*orted_debug_failure; orted_globals.abort = true; } /* are we the specified vpid? */ - if ((int)ORTE_PROC_MY_NAME->vpid == orted_globals.fail) { + if ((int)ORTE_PROC_MY_NAME->vpid == orted_debug_failure) { /* if the user specified we delay, then setup a timer * and have it kill us */ - if (0 < orted_globals.fail_delay) { - ORTE_TIMER_EVENT(orted_globals.fail_delay, 0, shutdown_callback, ORTE_SYS_PRI); + if (0 < orted_debug_failure_delay) { + ORTE_TIMER_EVENT(orted_debug_failure_delay, 0, shutdown_callback, ORTE_SYS_PRI); } else { opal_output(0, "%s is executing clean %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -901,11 +894,15 @@ static void shutdown_callback(int fd, short flags, void *arg) /* if we were ordered to abort, do so */ if (orted_globals.abort) { - opal_output(0, "%s is executing clean abort", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + opal_output(0, "%s is executing %s abort", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (orted_globals.test_suicide) ? "suicide" : "clean"); /* do -not- call finalize as this will send a message to the HNP * indicating clean termination! Instead, just kill our * local procs, forcibly cleanup the local session_dir tree, and abort */ + if (orted_globals.test_suicide) { + exit(1); + } orte_odls.kill_local_procs(NULL); orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); abort(); diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index cbe7d07f48..d63008e49a 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -1066,6 +1066,14 @@ int orterun(int argc, char *argv[]) } } + /* check for suicide test directives */ + if (NULL != getenv("ORTE_TEST_HNP_SUICIDE") || + NULL != getenv("ORTE_TEST_ORTED_SUICIDE")) { + /* don't forward IO from this process so we can + * see any debug after daemon termination */ + ORTE_FLAG_UNSET(jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT); + } + /* check for a job timeout specification, to be provided in seconds * as that is what MPICH used */ @@ -2862,6 +2870,11 @@ void orte_timeout_wakeup(int sd, short args, void *cbdata) orte_show_help("help-orterun.txt", "orterun:timeout", true, (NULL == tm) ? "NULL" : tm); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + /* if we are testing HNP suicide, then just exit */ + if (NULL != getenv("ORTE_TEST_HNP_SUICIDE")) { + opal_output(0, "HNP exiting w/o cleanup"); + exit(1); + } /* abort the job */ ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE); /* set the global abnormal exit flag */