diff --git a/orte/mca/errmgr/hnp/errmgr_hnp.c b/orte/mca/errmgr/hnp/errmgr_hnp.c index 717729a205..d26a5a040c 100644 --- a/orte/mca/errmgr/hnp/errmgr_hnp.c +++ b/orte/mca/errmgr/hnp/errmgr_hnp.c @@ -27,12 +27,6 @@ #include "opal/util/opal_sos.h" #include "opal/dss/dss.h" -#include "orte/util/error_strings.h" -#include "orte/util/name_fns.h" -#include "orte/util/proc_info.h" -#include "orte/util/show_help.h" -#include "orte/runtime/orte_globals.h" -#include "orte/runtime/orte_locks.h" #include "orte/mca/rml/rml.h" #include "orte/mca/odls/odls.h" #include "orte/mca/odls/base/base.h" @@ -43,6 +37,15 @@ #include "orte/mca/routed/routed.h" #include "orte/mca/debugger/base/base.h" +#include "orte/util/error_strings.h" +#include "orte/util/name_fns.h" +#include "orte/util/proc_info.h" +#include "orte/util/show_help.h" + +#include "orte/runtime/orte_globals.h" +#include "orte/runtime/orte_locks.h" +#include "orte/runtime/orte_quit.h" + #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/base/base.h" #include "orte/mca/errmgr/base/errmgr_private.h" @@ -296,8 +299,15 @@ static int update_state(orte_jobid_t job, /* get the job object */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; + /* if the orteds are terminating, check job complete */ + if (orte_orteds_term_ordered) { + opal_output(0, "TERM ORDERED - CHECKING COMPLETE"); + check_job_complete(NULL); + return ORTE_SUCCESS; + } else { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } } /* update is for a specific proc */ @@ -390,38 +400,48 @@ static int update_state(orte_jobid_t job, break; case ORTE_PROC_STATE_COMM_FAILED: - /* delete the route */ - orte_routed.delete_route(proc); - /* purge the oob */ - orte_rml.purge(proc); /* is this to a daemon? 
*/ if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { - /* if we have ordered orteds to terminate, see if this one failed to tell - * us it had terminated - */ + /* if this is my own connection, ignore it */ + if (ORTE_PROC_MY_NAME->vpid == proc->vpid) { + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s My own connection - ignoring it", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + break; + } + /* if we have ordered orteds to terminate, record it */ if (orte_orteds_term_ordered) { - if (orte_orted_exit_with_barrier) { - record_dead_daemon(jdata, proc->vpid, state, exit_code); - check_job_complete(jdata); - break; - } else { - record_dead_daemon(jdata, proc->vpid, state, 0); - check_job_complete(jdata); - break; - } + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s Daemons terminating - recording daemon %s as gone", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); + /* remove from dependent routes, if it is one */ + orte_routed.route_lost(proc); + /* update daemon job */ + record_dead_daemon(jdata, proc->vpid, state, 0); + /* check for complete */ + check_job_complete(jdata); + break; } /* if abort is in progress, see if this one failed to tell * us it had terminated */ if (orte_abnormal_term_ordered) { + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s Abort in progress - recording daemon %s as gone", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); + /* remove from dependent routes, if it is one */ + orte_routed.route_lost(proc); + /* update daemon job */ record_dead_daemon(jdata, proc->vpid, state, exit_code); + /* check for complete */ check_job_complete(jdata); break; } - /* if this is my own connection, ignore it */ - if (ORTE_PROC_MY_NAME->vpid == proc->vpid) { - break; - } + /* delete the route */ + orte_routed.delete_route(proc); + /* purge the oob */ + orte_rml.purge(proc); + if (orte_enable_recovery) { /* relocate its processes */ if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc, state, exit_code))) { @@ -755,7 
+775,14 @@ static void check_job_complete(orte_job_t *jdata) /* Check if FileM is active. If so then keep processing. */ OPAL_ACQUIRE_THREAD(&orte_filem_base_lock, &orte_filem_base_cond, &orte_filem_base_is_active); #endif - + if (NULL == jdata) { + /* just check to see if the daemons are complete */ + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_complete - received NULL job, checking daemons", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + goto CHECK_DAEMONS; + } + for (i=0; i < jdata->procs->size && !jdata->abort; i++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { /* the proc array may no longer be left justified, so @@ -978,14 +1005,21 @@ static void check_job_complete(orte_job_t *jdata) * This can happen if a ctrl-c hits in the "wrong" place * while launching */ +CHECK_DAEMONS: if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) { - jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); - if (jdata->num_terminated >= jdata->num_procs) { + if (0 == orte_routed.num_routes()) { /* orteds are done! */ + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s orteds complete - exiting", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + if (NULL == jdata) { + jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); + } jdata->state = ORTE_JOB_STATE_TERMINATED; - orte_trigger_event(&orteds_exit); + orte_quit(); return; } + return; } /* Release the resources used by this job. 
Since some errmgrs may want @@ -1094,15 +1128,22 @@ static void check_job_complete(orte_job_t *jdata) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); return; } - /* if we get here, then all jobs are done, so wakeup */ + /* if we get here, then all jobs are done, so terminate */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed all jobs terminated - waking up", + "%s errmgr:hnp:check_job_completed all jobs terminated", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* set the exit status to 0 - this will only happen if it * wasn't already set by an error condition */ ORTE_UPDATE_EXIT_STATUS(0); - orte_trigger_event(&orte_exit); + orte_jobs_complete(); + /* if I am the only daemon alive, then I can exit now */ + if (0 == orte_routed.num_routes()) { + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s orteds complete - exiting", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + orte_quit(); + } } static void killprocs(orte_jobid_t job, orte_vpid_t vpid) diff --git a/orte/mca/errmgr/orted/errmgr_orted.c b/orte/mca/errmgr/orted/errmgr_orted.c index 99f750875e..b81916fa68 100644 --- a/orte/mca/errmgr/orted/errmgr_orted.c +++ b/orte/mca/errmgr/orted/errmgr_orted.c @@ -35,6 +35,7 @@ #include "orte/mca/plm/plm_types.h" #include "orte/mca/routed/routed.h" #include "orte/mca/sensor/sensor.h" +#include "orte/runtime/orte_quit.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/base/base.h" @@ -244,16 +245,23 @@ static int update_state(orte_jobid_t job, ORTE_PROC_MY_NAME->vpid == proc->vpid) { return ORTE_SUCCESS; } - /* delete the route */ - orte_routed.delete_route(proc); - /* purge the oob */ - orte_rml.purge(proc); /* see if this was a lifeline */ if (ORTE_SUCCESS != orte_routed.route_lost(proc)) { /* kill our children */ killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); - /* tell the caller we can't recover */ - return ORTE_ERR_UNRECOVERABLE; + /* terminate - our routed children will see + * us leave and automatically die + */ + orte_quit(); + } 
+ /* purge the oob */ + orte_rml.purge(proc); + /* was it a daemon that failed? */ + if (proc->jobid == ORTE_PROC_MY_NAME->jobid) { + /* if all my routes are gone, then terminate ourselves */ + if (0 == orte_routed.num_routes()) { + orte_quit(); + } } /* if not, then indicate we can continue */ return ORTE_SUCCESS; @@ -272,10 +280,17 @@ static int update_state(orte_jobid_t job, } } if (NULL == jobdat) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; + /* must already be complete */ + return ORTE_SUCCESS; } + /* if there are no local procs for this job, we can + * ignore this call + */ + if (0 == jobdat->num_local_procs) { + return ORTE_SUCCESS; + } + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:orted got state %s for proc %s pid %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), diff --git a/orte/mca/ess/base/ess_base_std_orted.c b/orte/mca/ess/base/ess_base_std_orted.c index 28ef0785c2..e64772003d 100644 --- a/orte/mca/ess/base/ess_base_std_orted.c +++ b/orte/mca/ess/base/ess_base_std_orted.c @@ -37,6 +37,7 @@ #include "opal/mca/pstat/base/base.h" #include "opal/mca/paffinity/base/base.h" #include "opal/mca/sysinfo/base/base.h" +#include "opal/util/os_path.h" #include "orte/mca/rml/base/base.h" #include "orte/mca/routed/base/base.h" @@ -66,18 +67,61 @@ #include "orte/runtime/orte_cr.h" #include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_globals.h" +#include "orte/runtime/orte_quit.h" #include "orte/mca/ess/base/base.h" -static bool plm_in_use; +/* local globals */ +static bool plm_in_use=false; +static bool signals_set=false; +static struct opal_event term_handler; +static struct opal_event int_handler; +static struct opal_event epipe_handler; +#ifndef __WINDOWS__ +static struct opal_event sigusr1_handler; +static struct opal_event sigusr2_handler; +#endif /* __WINDOWS__ */ +char *log_path = NULL; +static void shutdown_signal(int fd, short flags, void *arg); +static void signal_callback(int fd, short flags, void *arg); +static 
void epipe_signal_callback(int fd, short flags, void *arg); int orte_ess_base_orted_setup(char **hosts) { int ret; + int fd; + char log_file[PATH_MAX]; + char *jobidstring; char *error = NULL; char *plm_to_use; int value; +#ifndef __WINDOWS__ + /* setup callback for SIGPIPE */ + opal_signal_set(&epipe_handler, SIGPIPE, + epipe_signal_callback, &epipe_handler); + opal_signal_add(&epipe_handler, NULL); + /* Set signal handlers to catch kill signals so we can properly clean up + * after ourselves. + */ + opal_event_set(&term_handler, SIGTERM, OPAL_EV_SIGNAL, + shutdown_signal, NULL); + opal_event_add(&term_handler, NULL); + opal_event_set(&int_handler, SIGINT, OPAL_EV_SIGNAL, + shutdown_signal, NULL); + opal_event_add(&int_handler, NULL); + + /** setup callbacks for signals we should ignore */ + opal_signal_set(&sigusr1_handler, SIGUSR1, + signal_callback, &sigusr1_handler); + opal_signal_add(&sigusr1_handler, NULL); + opal_signal_set(&sigusr2_handler, SIGUSR2, + signal_callback, &sigusr2_handler); + opal_signal_add(&sigusr2_handler, NULL); +#endif /* __WINDOWS__ */ + + signals_set = true; + /* initialize the global list of local children and job data */ OBJ_CONSTRUCT(&orte_local_children, opal_list_t); OBJ_CONSTRUCT(&orte_local_jobdata, opal_list_t); @@ -321,10 +365,48 @@ int orte_ess_base_orted_setup(char **hosts) goto error; } /* Once the session directory location has been established, set - the opal_output env file location to be in the - proc-specific session directory. */ + the opal_output env file location to be in the + proc-specific session directory. 
*/ opal_output_set_output_file_info(orte_process_info.proc_session_dir, "output-", NULL, NULL); + + /* setup stdout/stderr */ + if (orte_debug_daemons_file_flag) { + /* if we are debugging to a file, then send stdout/stderr to + * the orted log file + */ + + /* get my jobid */ + if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobidstring, + ORTE_PROC_MY_NAME->jobid))) { + ORTE_ERROR_LOG(ret); + error = "convert_jobid"; + goto error; + } + + /* define a log file name in the session directory */ + snprintf(log_file, PATH_MAX, "output-orted-%s-%s.log", + jobidstring, orte_process_info.nodename); + log_path = opal_os_path(false, + orte_process_info.tmpdir_base, + orte_process_info.top_session_dir, + log_file, + NULL); + + fd = open(log_path, O_RDWR|O_CREAT|O_TRUNC, 0640); + if (fd < 0) { + /* couldn't open the file for some reason, so + * just connect everything to /dev/null + */ + fd = open("/dev/null", O_RDWR|O_CREAT|O_TRUNC, 0666); + } else { + dup2(fd, STDOUT_FILENO); + dup2(fd, STDERR_FILENO); + if(fd != STDOUT_FILENO && fd != STDERR_FILENO) { + close(fd); + } + } + } } /* setup the routed info - the selected routed component @@ -434,7 +516,7 @@ int orte_ess_base_orted_setup(char **hosts) return ORTE_SUCCESS; -error: + error: orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); @@ -447,14 +529,27 @@ int orte_ess_base_orted_finalize(void) /* stop the local sensors */ orte_sensor.stop(ORTE_PROC_MY_NAME->jobid); - /* ensure all the orteds depart together */ - if (!orte_abnormal_term_ordered) { - /* if we are abnormally terminating, don't attempt - * to do a barrier as nobody else will be entering - * that call - */ - orte_grpcomm.onesided_barrier(); + if (signals_set) { + /* Release all local signal handlers */ + opal_event_del(&epipe_handler); + opal_event_del(&term_handler); + opal_event_del(&int_handler); +#ifndef __WINDOWS__ + opal_signal_del(&sigusr1_handler); + 
opal_signal_del(&sigusr2_handler); +#endif /* __WINDOWS__ */ } + + /* cleanup */ + if (NULL != log_path) { + unlink(log_path); + } + + /* make sure our local procs are dead */ + orte_odls.kill_local_procs(NULL); + + /* whack any lingering session directory files from our jobs */ + orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); orte_sensor_base_close(); orte_db_base_close(); @@ -493,3 +588,29 @@ int orte_ess_base_orted_finalize(void) return ORTE_SUCCESS; } + +static void shutdown_signal(int fd, short flags, void *arg) +{ + /* SIGTERM/SIGINT handler - record the default error + * exit status and shut down by calling orte_quit() + * directly + */ + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + orte_quit(); +} + +/** + * Deal with sigpipe errors + */ +static void epipe_signal_callback(int fd, short flags, void *arg) +{ + /* for now, we just announce and ignore them */ + opal_output(0, "%s reports a SIGPIPE error on fd %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), fd); + return; +} + +static void signal_callback(int fd, short event, void *arg) +{ + /* just ignore these signals */ +} diff --git a/orte/mca/ess/hnp/ess_hnp_module.c b/orte/mca/ess/hnp/ess_hnp_module.c index d85d174198..5ea7a30922 100644 --- a/orte/mca/ess/hnp/ess_hnp_module.c +++ b/orte/mca/ess/hnp/ess_hnp_module.c @@ -43,7 +43,6 @@ #include "opal/mca/paffinity/base/base.h" #include "opal/mca/sysinfo/base/base.h" -#include "orte/util/show_help.h" #include "orte/mca/rml/base/base.h" #include "orte/mca/rml/rml_types.h" #include "orte/mca/routed/base/base.h" @@ -53,6 +52,7 @@ #include "orte/mca/iof/base/base.h" #include "orte/mca/ras/base/base.h" #include "orte/mca/plm/base/base.h" +#include "orte/mca/plm/plm.h" #include "orte/mca/odls/base/base.h" #include "orte/mca/notifier/base/base.h" #include "orte/mca/rmcast/base/base.h" @@ -60,12 +60,14 @@ #include "orte/mca/sensor/base/base.h" #include "orte/mca/sensor/sensor.h" #include "orte/mca/debugger/base/base.h" - +#include 
"orte/mca/debugger/debugger.h" #include "orte/mca/rmaps/base/base.h" #if OPAL_ENABLE_FT_CR == 1 #include "orte/mca/snapc/base/base.h" #endif #include "orte/mca/filem/base/base.h" + +#include "orte/util/show_help.h" #include "orte/util/proc_info.h" #include "orte/util/session_dir.h" #include "orte/util/hnp_contact.h" @@ -76,8 +78,11 @@ #include "orte/runtime/runtime.h" #include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_globals.h" - +#include "orte/runtime/orte_quit.h" #include "orte/runtime/orte_cr.h" +#include "orte/runtime/orte_locks.h" +#include "orte/runtime/orte_data_server.h" + #include "orte/mca/ess/ess.h" #include "orte/mca/ess/base/base.h" #include "orte/mca/ess/hnp/ess_hnp.h" @@ -108,6 +113,23 @@ orte_ess_base_module_t orte_ess_hnp_module = { NULL /* ft_event */ }; +/* local globals */ +static bool signals_set=false; +static struct opal_event term_handler; +static struct opal_event int_handler; +static struct opal_event epipe_handler; +#ifndef __WINDOWS__ +static struct opal_event sigusr1_handler; +static struct opal_event sigusr2_handler; +static struct opal_event sigtstp_handler; +static struct opal_event sigcont_handler; +#endif /* __WINDOWS__ */ + +static void abort_signal_callback(int fd, short flags, void *arg); +static void abort_exit_callback(int fd, short event, void *arg); +static void epipe_signal_callback(int fd, short flags, void *arg); +static void signal_forward_callback(int fd, short event, void *arg); + static int rte_init(void) { int ret; @@ -124,6 +146,41 @@ static int rte_init(void) goto error; } +#ifndef __WINDOWS__ + /* setup callback for SIGPIPE */ + opal_signal_set(&epipe_handler, SIGPIPE, + epipe_signal_callback, &epipe_handler); + opal_signal_add(&epipe_handler, NULL); + /** setup callbacks for abort signals - from this point + * forward, we need to abort in a manner that allows us + * to cleanup + */ + opal_signal_set(&term_handler, SIGTERM, + abort_signal_callback, &term_handler); + opal_signal_add(&term_handler, 
NULL); + opal_signal_set(&int_handler, SIGINT, + abort_signal_callback, &int_handler); + opal_signal_add(&int_handler, NULL); + + /** setup callbacks for signals we should forward */ + opal_signal_set(&sigusr1_handler, SIGUSR1, + signal_forward_callback, &sigusr1_handler); + opal_signal_add(&sigusr1_handler, NULL); + opal_signal_set(&sigusr2_handler, SIGUSR2, + signal_forward_callback, &sigusr2_handler); + opal_signal_add(&sigusr2_handler, NULL); + if (orte_forward_job_control) { + opal_signal_set(&sigtstp_handler, SIGTSTP, + signal_forward_callback, &sigtstp_handler); + opal_signal_add(&sigtstp_handler, NULL); + opal_signal_set(&sigcont_handler, SIGCONT, + signal_forward_callback, &sigcont_handler); + opal_signal_add(&sigcont_handler, NULL); + } +#endif /* __WINDOWS__ */ + + signals_set = true; + /* determine the topology info */ if (0 == orte_default_num_sockets_per_board) { /* we weren't given a number, so try to determine it */ @@ -615,6 +672,24 @@ static int rte_finalize(void) orte_job_t *job; int i; + if (signals_set) { + /* Remove the epipe handler */ + opal_signal_del(&epipe_handler); + /* Remove the TERM and INT signal handlers */ + opal_signal_del(&term_handler); + opal_signal_del(&int_handler); +#ifndef __WINDOWS__ + /** Remove the USR signal handlers */ + opal_signal_del(&sigusr1_handler); + opal_signal_del(&sigusr2_handler); + if (orte_forward_job_control) { + opal_signal_del(&sigtstp_handler); + opal_signal_del(&sigcont_handler); + } +#endif /* __WINDOWS__ */ + signals_set = false; + } + /* stop the debuggers */ orte_debugger_base_close(); @@ -879,3 +954,132 @@ static int update_nidmap(opal_byte_object_t *bo) } return ORTE_SUCCESS; } + +static bool forcibly_die=false; + +static void abort_exit_callback(int fd, short ign, void *arg) +{ + int ret; + + fprintf(stderr, "%s: killing job...\n\n", orte_basename); + + /* since we are being terminated by a user's signal, be + * sure to exit with a non-zero exit code - but don't + * overwrite any error code from 
a proc that might have + * failed, in case that is why the user ordered us + * to terminate + */ + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + + /* terminate the job - this will also wakeup orterun so + * it can report to the user and kill all the orteds. + * Check the jobid, though, just in case the user + * hit ctrl-c before we had a chance to setup the + * job in the system - in which case there is nothing + * to terminate! + */ + if (!orte_never_launched) { + /* if the debuggers were run, clean up */ + orte_debugger.finalize(); + + /* + * Turn off the process recovery functionality, if it was enabled. + * This keeps the errmgr from trying to recover from the shutdown + * procedure. + */ + orte_enable_recovery = false; + + /* terminate the orteds - they will automatically kill + * their local procs + */ + ret = orte_plm.terminate_orteds(); + + } else { + /* if the jobid is invalid or we never launched, + * there is nothing to do but just clean ourselves + * up and exit + */ + orte_quit(); + } +} + +/* + * Attempt to terminate the job and wait for callback indicating + * the job has been aborted. 
+ */ +static void abort_signal_callback(int fd, short flags, void *arg) +{ + /* if we have already ordered this once, don't keep + * doing it to avoid race conditions + */ + if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */ + if (forcibly_die) { + /* kill any local procs */ + orte_odls.kill_local_procs(NULL); + + /* whack any lingering session directory files from our jobs */ + orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); + + /* cleanup our data server */ + orte_data_server_finalize(); + + /* exit with a non-zero status */ + exit(ORTE_ERROR_DEFAULT_EXIT_CODE); + } + fprintf(stderr, "%s: abort is already in progress...hit ctrl-c again to forcibly terminate\n\n", orte_basename); + forcibly_die = true; + return; + } + + /* set the global abnormal exit flag so we know not to + * use the standard xcast for terminating orteds + */ + orte_abnormal_term_ordered = true; + /* ensure that the forwarding of stdin stops */ + orte_job_term_ordered = true; + + /* tell us to be quiet - hey, the user killed us with a ctrl-c, + * so no need to tell them that! + */ + orte_execute_quiet = true; + + /* We are in an event handler; the job completed procedure + will delete the signal handler that is currently running + (which is a Bad Thing), so we can't call it directly. + Instead, we have to exit this handler and setup to call + job_completed() after this. 
*/ + ORTE_TIMER_EVENT(0, 0, abort_exit_callback); +} + +/** + * Deal with sigpipe errors + */ +static void epipe_signal_callback(int fd, short flags, void *arg) +{ + /* for now, we just announce and ignore them */ + OPAL_OUTPUT_VERBOSE((1, orte_debug_verbosity, + "%s reports a SIGPIPE error on fd %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), fd)); + return; +} + +/** + * Pass user signals to the remote application processes + */ +static void signal_forward_callback(int fd, short event, void *arg) +{ + struct opal_event *signal = (struct opal_event*)arg; + int signum, ret; + + signum = OPAL_EVENT_SIGNAL(signal); + if (!orte_execute_quiet){ + fprintf(stderr, "%s: Forwarding signal %d to job\n", + orte_basename, signum); + } + + /** send the signal out to the processes, including any descendants */ + if (ORTE_SUCCESS != (ret = orte_plm.signal_job(ORTE_JOBID_WILDCARD, signum))) { + fprintf(stderr, "Signal %d could not be sent to the job (returned %d)", + signum, ret); + } +} diff --git a/orte/mca/ess/slurm/ess_slurm_module.c b/orte/mca/ess/slurm/ess_slurm_module.c index 10cc423543..aab288a9f7 100644 --- a/orte/mca/ess/slurm/ess_slurm_module.c +++ b/orte/mca/ess/slurm/ess_slurm_module.c @@ -164,8 +164,6 @@ static int rte_finalize(void) /* if I am a daemon, finalize using the default procedure */ if (ORTE_PROC_IS_DAEMON) { - /* don't need to do the barrier */ - orte_orted_exit_with_barrier = false; if (ORTE_SUCCESS != (ret = orte_ess_base_orted_finalize())) { ORTE_ERROR_LOG(ret); } diff --git a/orte/mca/filem/base/filem_base_receive.c b/orte/mca/filem/base/filem_base_receive.c index cc7e8bd6fc..e55fd421b0 100644 --- a/orte/mca/filem/base/filem_base_receive.c +++ b/orte/mca/filem/base/filem_base_receive.c @@ -52,7 +52,7 @@ #include "orte/mca/rml/rml_types.h" #include "orte/util/name_fns.h" #include "orte/runtime/orte_globals.h" -#include "orte/runtime/orte_wait.h" +#include "orte/runtime/orte_quit.h" #include "orte/mca/filem/filem.h" #include "orte/mca/filem/base/base.h" @@ 
-195,7 +195,7 @@ static void filem_base_process_get_proc_node_name_cmd(orte_process_name_t* sende if (NULL == (jdata = orte_get_job_data_object(name.jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_UPDATE_EXIT_STATUS(1); - orte_trigger_event(&orte_exit); + orte_jobs_complete(); goto CLEANUP; } /* get the proc object for it */ @@ -203,7 +203,7 @@ static void filem_base_process_get_proc_node_name_cmd(orte_process_name_t* sende if (NULL == procs[name.vpid] || NULL == procs[name.vpid]->node) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_UPDATE_EXIT_STATUS(1); - orte_trigger_event(&orte_exit); + orte_jobs_complete(); goto CLEANUP; } @@ -213,7 +213,7 @@ static void filem_base_process_get_proc_node_name_cmd(orte_process_name_t* sende if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &(procs[name.vpid]->node->name), 1, OPAL_STRING))) { ORTE_ERROR_LOG(rc); ORTE_UPDATE_EXIT_STATUS(1); - orte_trigger_event(&orte_exit); + orte_jobs_complete(); goto CLEANUP; } @@ -299,13 +299,13 @@ static void filem_base_process_get_remote_path_cmd(orte_process_name_t* sender, if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &tmp_name, 1, OPAL_STRING))) { ORTE_ERROR_LOG(rc); ORTE_UPDATE_EXIT_STATUS(1); - orte_trigger_event(&orte_exit); + orte_jobs_complete(); goto CLEANUP; } if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &file_type, 1, OPAL_INT))) { ORTE_ERROR_LOG(rc); ORTE_UPDATE_EXIT_STATUS(1); - orte_trigger_event(&orte_exit); + orte_jobs_complete(); goto CLEANUP; } diff --git a/orte/mca/grpcomm/bad/grpcomm_bad_module.c b/orte/mca/grpcomm/bad/grpcomm_bad_module.c index 91ac14099e..64eb7968f3 100644 --- a/orte/mca/grpcomm/bad/grpcomm_bad_module.c +++ b/orte/mca/grpcomm/bad/grpcomm_bad_module.c @@ -51,7 +51,6 @@ static int xcast(orte_jobid_t job, orte_rml_tag_t tag); static int bad_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf); static int bad_barrier(void); -static int bad_onesided_barrier(void); static int modex(opal_list_t *procs); /* Module def */ @@ -62,7 +61,6 @@ 
orte_grpcomm_base_module_t orte_grpcomm_bad_module = { bad_allgather, orte_grpcomm_base_allgather_list, bad_barrier, - bad_onesided_barrier, orte_grpcomm_base_set_proc_attr, orte_grpcomm_base_get_proc_attr, modex, @@ -70,7 +68,7 @@ orte_grpcomm_base_module_t orte_grpcomm_bad_module = { }; /* Local variables */ -static orte_grpcomm_collective_t barrier, allgather, onesided_barrier; +static orte_grpcomm_collective_t barrier, allgather; /** * Initialize the module @@ -87,7 +85,6 @@ static int init(void) /* setup global variables */ OBJ_CONSTRUCT(&barrier, orte_grpcomm_collective_t); OBJ_CONSTRUCT(&allgather, orte_grpcomm_collective_t); - OBJ_CONSTRUCT(&onesided_barrier, orte_grpcomm_collective_t); /* if we are a daemon or the hnp, we need to post a * recv to catch any collective operations @@ -115,7 +112,6 @@ static void finalize(void) /* destruct the globals */ OBJ_DESTRUCT(&barrier); OBJ_DESTRUCT(&allgather); - OBJ_DESTRUCT(&onesided_barrier); /* if we are a daemon or the hnp, we need to cancel the * recv we posted @@ -229,124 +225,6 @@ static int bad_barrier(void) return rc; } -static void onesided_barrier_recv(int status, orte_process_name_t* sender, - opal_buffer_t* buffer, orte_rml_tag_t tag, - void* cbdata) -{ - orte_grpcomm_collective_t *coll = (orte_grpcomm_collective_t*)cbdata; - - OPAL_THREAD_LOCK(&coll->lock); - /* flag as recvd */ - coll->recvd += 1; - if (orte_process_info.num_procs == coll->recvd) { - opal_condition_broadcast(&coll->cond); - } - OPAL_THREAD_UNLOCK(&coll->lock); -} - -/* quick timeout loop */ -static bool timer_fired; - -static void quicktime_cb(int fd, short event, void *cbdata) -{ - /* declare it fired */ - timer_fired = true; -} - -static int bad_onesided_barrier(void) -{ - opal_list_t daemon_tree; - opal_list_item_t *item; - opal_buffer_t buf; - orte_process_name_t my_parent; - opal_event_t *quicktime=NULL; - struct timeval quicktimeval; - int rc; - - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:bad: onesided 
barrier called", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* if we are not to use the barrier, then just return */ - if (!orte_orted_exit_with_barrier) { - if (ORTE_PROC_IS_HNP) { - /* if we are the HNP, we need to do a little delay to give - * the orteds a chance to exit before we leave - */ - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:bad: onesided barrier adding delay timer", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - quicktimeval.tv_sec = 0; - quicktimeval.tv_usec = 100; - timer_fired = false; - ORTE_DETECT_TIMEOUT(&quicktime, orte_process_info.num_procs, 1000, 10000, quicktime_cb); - ORTE_PROGRESSED_WAIT(timer_fired, 0, 1); - } - return ORTE_SUCCESS; - } - - /* figure out how many participants we should be expecting */ - OBJ_CONSTRUCT(&daemon_tree, opal_list_t); - my_parent.jobid = ORTE_PROC_MY_NAME->jobid; - my_parent.vpid = orte_routed.get_routing_tree(&daemon_tree); - OPAL_THREAD_LOCK(&onesided_barrier.lock); - onesided_barrier.recvd += orte_process_info.num_procs - opal_list_get_size(&daemon_tree); - OPAL_THREAD_UNLOCK(&onesided_barrier.lock); - - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:bad: onesided barrier num_participating %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (int)(orte_process_info.num_procs - opal_list_get_size(&daemon_tree)))); - - /* disassemble the daemon tree */ - while (NULL != (item = opal_list_remove_first(&daemon_tree))) { - OBJ_RELEASE(item); - } - OBJ_DESTRUCT(&daemon_tree); - - /* set the recv */ - if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, - ORTE_RML_TAG_ONESIDED_BARRIER, - ORTE_RML_PERSISTENT, - onesided_barrier_recv, - &onesided_barrier))) { - ORTE_ERROR_LOG(rc); - } - - /* wait to get all my inputs */ - OPAL_THREAD_LOCK(&onesided_barrier.lock); - while (onesided_barrier.recvd < orte_process_info.num_procs) { - opal_condition_wait(&onesided_barrier.cond, &onesided_barrier.lock); - } - /* reset the collective */ - onesided_barrier.recvd = 0; - 
OPAL_THREAD_UNLOCK(&onesided_barrier.lock); - - /* cancel the recv */ - orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ONESIDED_BARRIER); - - /* if I am the HNP, then we are done */ - if (ORTE_PROC_IS_HNP) { - return ORTE_SUCCESS; - } - - /* send a zero-byte msg to my parent */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - /* send it */ - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:bad:onsided:barrier not the HNP - sending to parent %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&my_parent))); - if (0 > (rc = orte_rml.send_buffer(&my_parent, &buf, ORTE_RML_TAG_ONESIDED_BARRIER, 0))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buf); - return rc; - } - OBJ_DESTRUCT(&buf); - - return ORTE_SUCCESS; -} - static void allgather_recv(int status, orte_process_name_t* sender, opal_buffer_t *buffer, orte_rml_tag_t tag, void *cbdata) diff --git a/orte/mca/grpcomm/basic/grpcomm_basic_module.c b/orte/mca/grpcomm/basic/grpcomm_basic_module.c index 9a55569770..fa997593f7 100644 --- a/orte/mca/grpcomm/basic/grpcomm_basic_module.c +++ b/orte/mca/grpcomm/basic/grpcomm_basic_module.c @@ -55,7 +55,6 @@ static int xcast(orte_jobid_t job, orte_rml_tag_t tag); static int basic_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf); static int basic_barrier(void); -static int basic_onesided_barrier(void); static int modex(opal_list_t *procs); static int set_proc_attr(const char *attr_name, const void *data, size_t size); static int get_proc_attr(const orte_process_name_t proc, @@ -70,7 +69,6 @@ orte_grpcomm_base_module_t orte_grpcomm_basic_module = { basic_allgather, orte_grpcomm_base_allgather_list, basic_barrier, - basic_onesided_barrier, set_proc_attr, get_proc_attr, modex, @@ -78,7 +76,7 @@ orte_grpcomm_base_module_t orte_grpcomm_basic_module = { }; /* Local variables */ -static orte_grpcomm_collective_t barrier, allgather, onesided_barrier; +static orte_grpcomm_collective_t barrier, allgather; static bool recv_on; static opal_buffer_t *profile_buf=NULL; @@ 
-118,7 +116,6 @@ static int init(void) /* setup global variables */ OBJ_CONSTRUCT(&barrier, orte_grpcomm_collective_t); OBJ_CONSTRUCT(&allgather, orte_grpcomm_collective_t); - OBJ_CONSTRUCT(&onesided_barrier, orte_grpcomm_collective_t); if (ORTE_PROC_IS_HNP && recv_on) { /* open the profile file for writing */ @@ -186,7 +183,6 @@ static void finalize(void) /* destruct the globals */ OBJ_DESTRUCT(&barrier); OBJ_DESTRUCT(&allgather); - OBJ_DESTRUCT(&onesided_barrier); if (ORTE_PROC_IS_HNP && recv_on) { /* if we are profiling and I am the HNP, then stop the @@ -311,124 +307,6 @@ static int basic_barrier(void) return rc; } -static void onesided_barrier_recv(int status, orte_process_name_t* sender, - opal_buffer_t* buffer, orte_rml_tag_t tag, - void* cbdata) -{ - orte_grpcomm_collective_t *coll = (orte_grpcomm_collective_t*)cbdata; - - OPAL_THREAD_LOCK(&coll->lock); - /* flag as recvd */ - coll->recvd += 1; - if (orte_process_info.num_procs == coll->recvd) { - opal_condition_broadcast(&coll->cond); - } - OPAL_THREAD_UNLOCK(&coll->lock); -} -/* quick timeout loop */ -static bool timer_fired; - -static void quicktime_cb(int fd, short event, void *cbdata) -{ - /* declare it fired */ - timer_fired = true; -} - -static int basic_onesided_barrier(void) -{ - opal_list_t daemon_tree; - opal_list_item_t *item; - opal_buffer_t buf; - orte_process_name_t my_parent; - opal_event_t *quicktime=NULL; - struct timeval quicktimeval; - int rc; - - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:basic: onesided barrier called", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* if we are not to use the barrier, then just return */ - if (!orte_orted_exit_with_barrier) { - if (ORTE_PROC_IS_HNP) { - /* if we are the HNP, we need to do a little delay to give - * the orteds a chance to exit before we leave - */ - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:basic: onesided barrier adding delay timer", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - quicktimeval.tv_sec 
= 0; - quicktimeval.tv_usec = 100; - timer_fired = false; - ORTE_DETECT_TIMEOUT(&quicktime, orte_process_info.num_procs, 1000, 10000, quicktime_cb); - ORTE_PROGRESSED_WAIT(timer_fired, 0, 1); - } - return ORTE_SUCCESS; - } - - /* figure out how many participants we should be expecting */ - OBJ_CONSTRUCT(&daemon_tree, opal_list_t); - my_parent.jobid = ORTE_PROC_MY_NAME->jobid; - my_parent.vpid = orte_routed.get_routing_tree(&daemon_tree); - OPAL_THREAD_LOCK(&onesided_barrier.lock); - onesided_barrier.recvd += orte_process_info.num_procs - opal_list_get_size(&daemon_tree); - OPAL_THREAD_UNLOCK(&onesided_barrier.lock); - - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:basic: onesided barrier num_participating %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (int)(orte_process_info.num_procs - opal_list_get_size(&daemon_tree)))); - - /* disassemble the daemon tree */ - while (NULL != (item = opal_list_remove_first(&daemon_tree))) { - OBJ_RELEASE(item); - } - OBJ_DESTRUCT(&daemon_tree); - - /* set the recv */ - if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, - ORTE_RML_TAG_ONESIDED_BARRIER, - ORTE_RML_PERSISTENT, - onesided_barrier_recv, - &onesided_barrier))) { - ORTE_ERROR_LOG(rc); - } - - /* wait to get all my inputs */ - OPAL_THREAD_LOCK(&onesided_barrier.lock); - while (onesided_barrier.recvd < orte_process_info.num_procs) { - opal_condition_wait(&onesided_barrier.cond, &onesided_barrier.lock); - } - /* reset the collective */ - onesided_barrier.recvd = 0; - OPAL_THREAD_UNLOCK(&onesided_barrier.lock); - - /* cancel the recv */ - orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ONESIDED_BARRIER); - - /* if I am the HNP, then we are done */ - if (ORTE_PROC_IS_HNP) { - return ORTE_SUCCESS; - } - - /* send a zero-byte msg to my parent */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - /* send it */ - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:basic:onsided:barrier not the HNP - sending to parent %s", - 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&my_parent))); - if (0 > (rc = orte_rml.send_buffer(&my_parent, &buf, ORTE_RML_TAG_ONESIDED_BARRIER, 0))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buf); - return rc; - } - OBJ_DESTRUCT(&buf); - - return ORTE_SUCCESS; -} - - static void allgather_recv(int status, orte_process_name_t* sender, opal_buffer_t *buffer, orte_rml_tag_t tag, void *cbdata) diff --git a/orte/mca/grpcomm/cnos/grpcomm_cnos_module.c b/orte/mca/grpcomm/cnos/grpcomm_cnos_module.c index 4b64eb7c9d..0a73964799 100644 --- a/orte/mca/grpcomm/cnos/grpcomm_cnos_module.c +++ b/orte/mca/grpcomm/cnos/grpcomm_cnos_module.c @@ -75,7 +75,6 @@ orte_grpcomm_base_module_t orte_grpcomm_cnos_module = { allgather, allgather_list, orte_grpcomm_cnos_barrier, - orte_grpcomm_cnos_barrier, set_proc_attr, get_proc_attr, modex, diff --git a/orte/mca/grpcomm/grpcomm.h b/orte/mca/grpcomm/grpcomm.h index d933e4fe49..9d6b4ac9f1 100644 --- a/orte/mca/grpcomm/grpcomm.h +++ b/orte/mca/grpcomm/grpcomm.h @@ -71,11 +71,6 @@ typedef int (*orte_grpcomm_base_module_allgather_list_fn_t)(opal_list_t *names, /* barrier function */ typedef int (*orte_grpcomm_base_module_barrier_fn_t)(void); -/* one-sided barrier function - process releases once its - * contribution is complete - */ -typedef int (*orte_grpcomm_base_module_onesided_barrier_fn_t)(void); - /** DATA EXCHANGE FUNCTIONS - SEE ompi/runtime/ompi_module_exchange.h FOR A DESCRIPTION * OF HOW THIS ALL WORKS @@ -108,7 +103,6 @@ struct orte_grpcomm_base_module_2_0_0_t { orte_grpcomm_base_module_allgather_fn_t allgather; orte_grpcomm_base_module_allgather_list_fn_t allgather_list; orte_grpcomm_base_module_barrier_fn_t barrier; - orte_grpcomm_base_module_onesided_barrier_fn_t onesided_barrier; /* modex functions */ orte_grpcomm_base_module_modex_set_proc_attr_fn_t set_proc_attr; orte_grpcomm_base_module_modex_get_proc_attr_fn_t get_proc_attr; diff --git a/orte/mca/grpcomm/hier/grpcomm_hier_module.c 
b/orte/mca/grpcomm/hier/grpcomm_hier_module.c index 76d4f17653..66a14347b9 100644 --- a/orte/mca/grpcomm/hier/grpcomm_hier_module.c +++ b/orte/mca/grpcomm/hier/grpcomm_hier_module.c @@ -68,7 +68,6 @@ orte_grpcomm_base_module_t orte_grpcomm_hier_module = { hier_allgather, orte_grpcomm_base_allgather_list, hier_barrier, - NULL, /* onesided barrier only used by daemons */ set_proc_attr, get_proc_attr, modex, diff --git a/orte/mca/grpcomm/mcast/grpcomm_mcast.c b/orte/mca/grpcomm/mcast/grpcomm_mcast.c index f970eefa89..caa50c842f 100644 --- a/orte/mca/grpcomm/mcast/grpcomm_mcast.c +++ b/orte/mca/grpcomm/mcast/grpcomm_mcast.c @@ -48,7 +48,6 @@ static int xcast(orte_jobid_t job, orte_rml_tag_t tag); static int mcast_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf); static int mcast_barrier(void); -static int mcast_onesided_barrier(void); static int modex(opal_list_t *procs); static int get_proc_attr(const orte_process_name_t proc, const char * attribute_name, void **val, @@ -62,7 +61,6 @@ orte_grpcomm_base_module_t orte_grpcomm_mcast_module = { mcast_allgather, orte_grpcomm_base_allgather_list, mcast_barrier, - mcast_onesided_barrier, orte_grpcomm_base_set_proc_attr, get_proc_attr, modex, @@ -77,7 +75,7 @@ static void daemon_recv(int status, opal_buffer_t *buf, void* cbdata); /* Local variables */ -static orte_grpcomm_collective_t barrier, allgather, onesided_barrier; +static orte_grpcomm_collective_t barrier, allgather; /** * Initialize the module @@ -93,7 +91,6 @@ static int init(void) /* setup global variables */ OBJ_CONSTRUCT(&barrier, orte_grpcomm_collective_t); OBJ_CONSTRUCT(&allgather, orte_grpcomm_collective_t); - OBJ_CONSTRUCT(&onesided_barrier, orte_grpcomm_collective_t); /* point to our collective function */ orte_grpcomm_base.daemon_coll = orte_grpcomm_mcast_daemon_coll; @@ -130,7 +127,6 @@ static void finalize(void) /* destruct the globals */ OBJ_DESTRUCT(&barrier); OBJ_DESTRUCT(&allgather); - OBJ_DESTRUCT(&onesided_barrier); } /** @@ -285,73 +281,6 @@ 
static int mcast_barrier(void) return rc; } - -/* quick timeout loop */ -static bool timer_fired; - -static void quicktime_cb(int fd, short event, void *cbdata) -{ - /* declare it fired */ - timer_fired = true; -} - -static int mcast_onesided_barrier(void) -{ - opal_event_t *quicktime=NULL; - struct timeval quicktimeval; - int rc; - - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:mcast: onesided barrier called", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* if I am alone, just return */ - if (1 == orte_process_info.num_procs) { - return ORTE_SUCCESS; - } - - /* if we are not to use the barrier, then just return */ - if (!orte_orted_exit_with_barrier) { - if (ORTE_PROC_IS_HNP) { - /* if we are the HNP, we need to do a little delay to give - * the orteds a chance to exit before we leave - */ - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:mcast: onesided barrier adding delay timer", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - quicktimeval.tv_sec = 0; - quicktimeval.tv_usec = 100; - timer_fired = false; - ORTE_DETECT_TIMEOUT(&quicktime, orte_process_info.num_procs, 1000, 10000, quicktime_cb); - ORTE_PROGRESSED_WAIT(timer_fired, 0, 1); - } - return ORTE_SUCCESS; - } - - /* if we are not the HNP, just send and leave */ - if (!ORTE_PROC_IS_HNP) { - if (ORTE_SUCCESS != (rc = xcast(ORTE_PROC_MY_NAME->jobid, NULL, ORTE_RML_TAG_ONESIDED_BARRIER))) { - ORTE_ERROR_LOG(rc); - } - return rc; - } - - /* initialize things */ - OPAL_THREAD_LOCK(&onesided_barrier.lock); - onesided_barrier.recvd += 1; /* account for me */ - OPAL_THREAD_UNLOCK(&onesided_barrier.lock); - - /* wait to complete */ - OPAL_THREAD_LOCK(&onesided_barrier.lock); - while (orte_process_info.num_procs <= onesided_barrier.recvd) { - opal_condition_wait(&onesided_barrier.cond, &onesided_barrier.lock); - } - /* reset the collective */ - onesided_barrier.recvd = 0; - OPAL_THREAD_UNLOCK(&onesided_barrier.lock); - return ORTE_SUCCESS; -} - static void allgather_recv(int status, 
orte_process_name_t* sender, opal_buffer_t *buffer, orte_rml_tag_t tag, void *cbdata) @@ -551,16 +480,6 @@ static void daemon_recv(int status, ORTE_MESSAGE_EVENT(sender, buf, ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor); break; - case ORTE_RML_TAG_ONESIDED_BARRIER: - OPAL_THREAD_LOCK(&onesided_barrier.lock); - onesided_barrier.recvd += 1; - /* check for completion */ - if (orte_process_info.num_procs <= onesided_barrier.recvd) { - opal_condition_broadcast(&onesided_barrier.cond); - } - OPAL_THREAD_UNLOCK(&onesided_barrier.lock); - break; - case ORTE_RML_TAG_BARRIER: OPAL_THREAD_LOCK(&barrier.lock); /* the recv is the trigger */ diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index b9e0f9e1e6..99f0488d6b 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2009 Institut National de Recherche en Informatique * et Automatique. All rights reserved. 
* $COPYRIGHT$ @@ -57,7 +57,7 @@ #include "orte/runtime/orte_globals.h" #include "orte/runtime/runtime.h" #include "orte/runtime/orte_locks.h" -#include "orte/runtime/orte_wait.h" +#include "orte/runtime/orte_quit.h" #include "orte/util/name_fns.h" #include "orte/util/nidmap.h" #include "orte/util/proc_info.h" @@ -149,7 +149,7 @@ int orte_plm_base_setup_job(orte_job_t *jdata) if (NULL == crud) { orte_never_launched = true; ORTE_UPDATE_EXIT_STATUS(0); - orte_trigger_event(&orte_exit); + orte_jobs_complete(); return ORTE_ERROR; } orte_util_nidmap_init(NULL); @@ -173,7 +173,7 @@ int orte_plm_base_setup_job(orte_job_t *jdata) free(crud); orte_never_launched = true; ORTE_UPDATE_EXIT_STATUS(0); - orte_trigger_event(&orte_exit); + orte_jobs_complete(); return ORTE_ERROR; } @@ -198,7 +198,7 @@ int orte_plm_base_setup_job(orte_job_t *jdata) if (orte_do_not_launch) { orte_never_launched = true; ORTE_UPDATE_EXIT_STATUS(0); - orte_trigger_event(&orte_exit); + orte_jobs_complete(); return ORTE_ERR_SILENT; } @@ -214,7 +214,7 @@ int orte_plm_base_setup_job(orte_job_t *jdata) ORTE_VPID_PRINT(jdata->num_procs)); orte_never_launched = true; ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - orte_trigger_event(&orte_exit); + orte_jobs_complete(); return ORTE_ERROR; } diff --git a/orte/mca/plm/base/plm_base_receive.c b/orte/mca/plm/base/plm_base_receive.c index 79238765f9..02c9cb7376 100644 --- a/orte/mca/plm/base/plm_base_receive.c +++ b/orte/mca/plm/base/plm_base_receive.c @@ -49,7 +49,7 @@ #include "orte/mca/ras/base/base.h" #include "orte/util/name_fns.h" #include "orte/runtime/orte_globals.h" -#include "orte/runtime/orte_wait.h" +#include "orte/runtime/orte_quit.h" #include "orte/mca/plm/plm_types.h" #include "orte/mca/plm/plm.h" @@ -509,7 +509,7 @@ static void process_msg(int fd, short event, void *data) /* see if an error occurred - if so, wakeup the HNP so we can exit */ if (ORTE_PROC_IS_HNP && ORTE_SUCCESS != rc) { - orte_trigger_event(&orte_exit); + 
orte_jobs_complete(); } } diff --git a/orte/mca/plm/ccp/plm_ccp_module.c b/orte/mca/plm/ccp/plm_ccp_module.c index 8b48a7f529..342405a331 100644 --- a/orte/mca/plm/ccp/plm_ccp_module.c +++ b/orte/mca/plm/ccp/plm_ccp_module.c @@ -114,9 +114,6 @@ static int plm_ccp_init(void) ORTE_ERROR_LOG(rc); } - /* we don't need a barrier to exit */ - orte_orted_exit_with_barrier = false; - return rc; } diff --git a/orte/mca/plm/process/plm_process_module.c b/orte/mca/plm/process/plm_process_module.c index 9fa4a88f59..2064a109a0 100644 --- a/orte/mca/plm/process/plm_process_module.c +++ b/orte/mca/plm/process/plm_process_module.c @@ -227,9 +227,6 @@ int orte_plm_process_init(void) SecureZeroMemory(user_name, sizeof(user_name)); SecureZeroMemory(user_password, sizeof(user_password)); - /* we don't need a barrier to exit */ - orte_orted_exit_with_barrier = false; - return rc; } diff --git a/orte/mca/plm/rshd/plm_rshd_module.c b/orte/mca/plm/rshd/plm_rshd_module.c index 78305cbe98..c7aa1e6c08 100644 --- a/orte/mca/plm/rshd/plm_rshd_module.c +++ b/orte/mca/plm/rshd/plm_rshd_module.c @@ -72,6 +72,7 @@ #include "orte/util/show_help.h" #include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_globals.h" +#include "orte/runtime/orte_quit.h" #include "orte/util/name_fns.h" #include "orte/util/nidmap.h" #include "orte/util/proc_info.h" @@ -396,7 +397,7 @@ int orte_plm_rshd_terminate_job(orte_jobid_t jobid) */ int orte_plm_rshd_terminate_orteds(void) { - orte_trigger_event(&orteds_exit); + orte_quit(); return ORTE_SUCCESS; } diff --git a/orte/mca/plm/slurm/plm_slurm_module.c b/orte/mca/plm/slurm/plm_slurm_module.c index 4593858bdb..c37dae9082 100644 --- a/orte/mca/plm/slurm/plm_slurm_module.c +++ b/orte/mca/plm/slurm/plm_slurm_module.c @@ -67,6 +67,7 @@ #include "orte/util/regex.h" #include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_wait.h" +#include "orte/runtime/orte_quit.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/rmaps/rmaps.h" @@ -129,9 +130,6 @@ 
static int plm_slurm_init(void) local_launch_available = true; } - /* we don't need a barrier to exit */ - orte_orted_exit_with_barrier = false; - return rc; } @@ -522,7 +520,7 @@ static int plm_slurm_terminate_orteds(void) jdata->state = ORTE_JOB_STATE_TERMINATED; /* need to set the #terminated value to avoid an incorrect error msg */ jdata->num_terminated = jdata->num_procs; - orte_trigger_event(&orteds_exit); + orte_quit(); } return rc; @@ -615,7 +613,7 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){ jdata->state = ORTE_JOB_STATE_TERMINATED; /* need to set the #terminated value to avoid an incorrect error msg */ jdata->num_terminated = jdata->num_procs; - orte_trigger_event(&orteds_exit); + orte_quit(); } } } diff --git a/orte/mca/plm/tmd/plm_tmd_module.c b/orte/mca/plm/tmd/plm_tmd_module.c index e113f1731b..2102f48626 100644 --- a/orte/mca/plm/tmd/plm_tmd_module.c +++ b/orte/mca/plm/tmd/plm_tmd_module.c @@ -68,6 +68,7 @@ #include "orte/runtime/orte_wait.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/rmaps/rmaps.h" +#include "orte/runtime/orte_quit.h" #include "orte/mca/plm/plm.h" #include "orte/mca/plm/base/plm_private.h" @@ -716,7 +717,7 @@ int plm_tmd_terminate_orteds(void) } else { jdata->state = ORTE_JOB_STATE_TERMINATED; } - orte_trigger_event(&orteds_exit); + orte_quit(); return rc; } diff --git a/orte/mca/ras/base/ras_base_allocate.c b/orte/mca/ras/base/ras_base_allocate.c index 95c0889059..d0736b8901 100644 --- a/orte/mca/ras/base/ras_base_allocate.c +++ b/orte/mca/ras/base/ras_base_allocate.c @@ -41,6 +41,7 @@ #include "orte/util/dash_host/dash_host.h" #include "orte/util/proc_info.h" #include "orte/util/comm/comm.h" +#include "orte/runtime/orte_quit.h" #include "orte/mca/ras/base/ras_private.h" @@ -169,7 +170,7 @@ int orte_ras_base_allocate(orte_job_t *jdata) OBJ_DESTRUCT(&nodes); orte_show_help("help-ras-base.txt", "ras-base:no-allocation", true); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - 
orte_trigger_event(&orte_exit); + orte_jobs_complete(); return ORTE_ERROR; } diff --git a/orte/mca/rml/rml_types.h b/orte/mca/rml/rml_types.h index e11d88f7bf..d0edb2d723 100644 --- a/orte/mca/rml/rml_types.h +++ b/orte/mca/rml/rml_types.h @@ -164,30 +164,27 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_msg_packet_t); /* profile data */ #define ORTE_RML_TAG_GRPCOMM_PROFILE 33 -/* onesided barrier */ -#define ORTE_RML_TAG_ONESIDED_BARRIER 34 - /* bootstrap */ -#define ORTE_RML_TAG_BOOTSTRAP 35 +#define ORTE_RML_TAG_BOOTSTRAP 34 /* TCP "fake" multicast */ -#define ORTE_RML_TAG_MULTICAST 36 +#define ORTE_RML_TAG_MULTICAST 35 /* multicast messages sent direct */ -#define ORTE_RML_TAG_MULTICAST_DIRECT 37 +#define ORTE_RML_TAG_MULTICAST_DIRECT 36 /* multicast messages to be relayed */ -#define ORTE_RML_TAG_MULTICAST_RELAY 38 +#define ORTE_RML_TAG_MULTICAST_RELAY 37 /* tag for receiving ack of abort msg */ -#define ORTE_RML_TAG_ABORT 39 +#define ORTE_RML_TAG_ABORT 38 /* tag for receiving heartbeats */ -#define ORTE_RML_TAG_HEARTBEAT 40 +#define ORTE_RML_TAG_HEARTBEAT 39 /* notifier data */ -#define ORTE_RML_TAG_NOTIFIER_HNP 41 +#define ORTE_RML_TAG_NOTIFIER_HNP 40 /* comm leader failed */ -#define ORTE_RML_TAG_LEADER 42 +#define ORTE_RML_TAG_LEADER 41 #define ORTE_RML_TAG_MAX 100 diff --git a/orte/mca/routed/binomial/routed_binomial.c b/orte/mca/routed/binomial/routed_binomial.c index 8fcba25ed7..1ae0481fa2 100644 --- a/orte/mca/routed/binomial/routed_binomial.c +++ b/orte/mca/routed/binomial/routed_binomial.c @@ -46,6 +46,7 @@ static int update_routing_tree(void); static orte_vpid_t get_routing_tree(opal_list_t *children); static int get_wireup_info(opal_buffer_t *buf); static int set_lifeline(orte_process_name_t *proc); +static size_t num_routes(void); #if OPAL_ENABLE_FT_CR == 1 static int binomial_ft_event(int state); @@ -64,6 +65,7 @@ orte_routed_module_t orte_routed_binomial_module = { update_routing_tree, get_routing_tree, get_wireup_info, + num_routes, #if 
OPAL_ENABLE_FT_CR == 1 binomial_ft_event #else @@ -698,6 +700,14 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat) static int route_lost(const orte_process_name_t *route) { + opal_list_item_t *item; + orte_routed_tree_t *child; + + OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, + "%s route to %s lost", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(route))); + /* if we lose the connection to the lifeline and we are NOT already, * in finalize, tell the OOB to abort. * NOTE: we cannot call abort from here as the OOB needs to first @@ -712,6 +722,23 @@ static int route_lost(const orte_process_name_t *route) return ORTE_ERR_FATAL; } + /* if we are the HNP or a daemon, is it a daemon, and one of my children? if so, then + * remove it from the child list + */ + if ((ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) && + route->jobid == ORTE_PROC_MY_NAME->jobid) { + for (item = opal_list_get_first(&my_children); + item != opal_list_get_end(&my_children); + item = opal_list_get_next(item)) { + child = (orte_routed_tree_t*)item; + if (child->vpid == route->vpid) { + opal_list_remove_item(&my_children, item); + OBJ_RELEASE(item); + return ORTE_SUCCESS; + } + } + } + /* we don't care about this one, so return success */ return ORTE_SUCCESS; } @@ -904,6 +931,14 @@ static int get_wireup_info(opal_buffer_t *buf) return ORTE_SUCCESS; } +static size_t num_routes(void) +{ + OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, + "%s num routes %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (int)opal_list_get_size(&my_children))); + return opal_list_get_size(&my_children); +} #if OPAL_ENABLE_FT_CR == 1 static int binomial_ft_event(int state) diff --git a/orte/mca/routed/cm/routed_cm.c b/orte/mca/routed/cm/routed_cm.c index 12cba80026..8ef9024a7a 100644 --- a/orte/mca/routed/cm/routed_cm.c +++ b/orte/mca/routed/cm/routed_cm.c @@ -49,6 +49,7 @@ static int update_routing_tree(void); static orte_vpid_t get_routing_tree(opal_list_t *children); static int 
get_wireup_info(opal_buffer_t *buf); static int set_lifeline(orte_process_name_t *proc); +static size_t num_routes(void); #if OPAL_ENABLE_FT_CR == 1 static int cm_ft_event(int state); @@ -67,6 +68,7 @@ orte_routed_module_t orte_routed_cm_module = { update_routing_tree, get_routing_tree, get_wireup_info, + num_routes, #if OPAL_ENABLE_FT_CR == 1 cm_ft_event #else @@ -734,22 +736,8 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat) static int route_lost(const orte_process_name_t *route) { - /* if we are the HNP and lose a route, check to see if it is - * to a daemon - */ if (ORTE_PROC_IS_HNP) { - if (ORTE_PROC_MY_NAME->jobid == route->jobid) { - /* this was a daemon - notify the errmgr - * so we can take appropriate recovery, if desired - */ - opal_output(0, "%s routed:cm: daemon %s has died", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_VPID_PRINT(route->vpid)); - orte_errmgr.update_state(route->jobid, ORTE_JOB_STATE_COMM_FAILED, - (orte_process_name_t*)route, - ORTE_PROC_STATE_COMM_FAILED, 0, 1); - } - /* either way, take no further action */ + /* take no further action */ return ORTE_SUCCESS; } @@ -890,6 +878,23 @@ static int get_wireup_info(opal_buffer_t *buf) return ORTE_SUCCESS; } +static size_t num_routes(void) +{ + orte_job_t *jdata; + + if (!ORTE_PROC_IS_HNP) { + return 0; + } + + /* if I am the HNP, then the number of routes is + * the number of daemons (other than me) still alive + */ + if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return 0; + } + return (jdata->num_procs - jdata->num_terminated - 1); +} #if OPAL_ENABLE_FT_CR == 1 static int cm_ft_event(int state) diff --git a/orte/mca/routed/direct/routed_direct.c b/orte/mca/routed/direct/routed_direct.c index 6f0e9cc6dc..79fa68304d 100644 --- a/orte/mca/routed/direct/routed_direct.c +++ b/orte/mca/routed/direct/routed_direct.c @@ -40,6 +40,7 @@ static int update_routing_tree(void); static orte_vpid_t 
get_routing_tree(opal_list_t *children); static int get_wireup_info(opal_buffer_t *buf); static int set_lifeline(orte_process_name_t *proc); +static size_t num_routes(void); #if OPAL_ENABLE_FT_CR == 1 static int direct_ft_event(int state); @@ -58,6 +59,7 @@ orte_routed_module_t orte_routed_direct_module = { update_routing_tree, get_routing_tree, get_wireup_info, + num_routes, #if OPAL_ENABLE_FT_CR == 1 direct_ft_event #else @@ -336,6 +338,24 @@ static int get_wireup_info(opal_buffer_t *buf) return ORTE_SUCCESS; } +static size_t num_routes(void) +{ + orte_job_t *jdata; + + if (!ORTE_PROC_IS_HNP) { + return 0; + } + + /* if I am the HNP, then the number of routes is + * the number of daemons still alive (other than me) + */ + if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return 0; + } + + return (jdata->num_procs - jdata->num_terminated - 1); +} #if OPAL_ENABLE_FT_CR == 1 static int direct_ft_event(int state) diff --git a/orte/mca/routed/linear/routed_linear.c b/orte/mca/routed/linear/routed_linear.c index 3678372d35..70e930b016 100644 --- a/orte/mca/routed/linear/routed_linear.c +++ b/orte/mca/routed/linear/routed_linear.c @@ -45,6 +45,7 @@ static int update_routing_tree(void); static orte_vpid_t get_routing_tree(opal_list_t *children); static int get_wireup_info(opal_buffer_t *buf); static int set_lifeline(orte_process_name_t *proc); +static size_t num_routes(void); #if OPAL_ENABLE_FT_CR == 1 static int linear_ft_event(int state); @@ -63,6 +64,7 @@ orte_routed_module_t orte_routed_linear_module = { update_routing_tree, get_routing_tree, get_wireup_info, + num_routes, #if OPAL_ENABLE_FT_CR == 1 linear_ft_event #else @@ -777,6 +779,11 @@ static int get_wireup_info(opal_buffer_t *buf) return ORTE_SUCCESS; } +static size_t num_routes(void) +{ + return 0; +} + #if OPAL_ENABLE_FT_CR == 1 static int linear_ft_event(int state) { diff --git a/orte/mca/routed/radix/routed_radix.c 
b/orte/mca/routed/radix/routed_radix.c index f4f95f9a30..6ada5adbd9 100644 --- a/orte/mca/routed/radix/routed_radix.c +++ b/orte/mca/routed/radix/routed_radix.c @@ -46,6 +46,7 @@ static int update_routing_tree(void); static orte_vpid_t get_routing_tree(opal_list_t *children); static int get_wireup_info(opal_buffer_t *buf); static int set_lifeline(orte_process_name_t *proc); +static size_t num_routes(void); #if OPAL_ENABLE_FT_CR == 1 static int radix_ft_event(int state); @@ -64,6 +65,7 @@ orte_routed_module_t orte_routed_radix_module = { update_routing_tree, get_routing_tree, get_wireup_info, + num_routes, #if OPAL_ENABLE_FT_CR == 1 radix_ft_event #else @@ -687,6 +689,9 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat) static int route_lost(const orte_process_name_t *route) { + opal_list_item_t *item; + orte_routed_tree_t *child; + /* if we lose the connection to the lifeline and we are NOT already, * in finalize, tell the OOB to abort. * NOTE: we cannot call abort from here as the OOB needs to first @@ -701,6 +706,23 @@ static int route_lost(const orte_process_name_t *route) return ORTE_ERR_FATAL; } + /* if we are the HNP or daemon, and the route is a daemon, + * see if it is one of our children - if so, remove it + */ + if ((ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) && + route->jobid == ORTE_PROC_MY_NAME->jobid) { + for (item = opal_list_get_first(&my_children); + item != opal_list_get_end(&my_children); + item = opal_list_get_next(item)) { + child = (orte_routed_tree_t*)item; + if (child->vpid == route->vpid) { + opal_list_remove_item(&my_children, item); + OBJ_RELEASE(item); + return ORTE_SUCCESS; + } + } + } + /* we don't care about this one, so return success */ return ORTE_SUCCESS; } @@ -896,6 +918,11 @@ static int get_wireup_info(opal_buffer_t *buf) return ORTE_SUCCESS; } +static size_t num_routes(void) +{ + return opal_list_get_size(&my_children); +} + #if OPAL_ENABLE_FT_CR == 1 static int radix_ft_event(int state) { diff --git 
a/orte/mca/routed/routed.h b/orte/mca/routed/routed.h index b75aca3ef3..da2fff8ba6 100644 --- a/orte/mca/routed/routed.h +++ b/orte/mca/routed/routed.h @@ -215,6 +215,13 @@ typedef orte_vpid_t (*orte_routed_module_get_routing_tree_fn_t)(opal_list_t *chi */ typedef int (*orte_routed_module_set_lifeline_fn_t)(orte_process_name_t *proc); +/* + * Get the number of routes supported by this process + * + * Returns the size of the routing tree using an O(1) function + */ +typedef size_t (*orte_routed_module_num_routes_fn_t)(void); + /** * Handle fault tolerance updates * @@ -251,6 +258,7 @@ struct orte_routed_module_t { orte_routed_module_update_routing_tree_fn_t update_routing_tree; orte_routed_module_get_routing_tree_fn_t get_routing_tree; orte_routed_module_get_wireup_info_fn_t get_wireup_info; + orte_routed_module_num_routes_fn_t num_routes; /* FT Notification */ orte_routed_module_ft_event_fn_t ft_event; }; diff --git a/orte/mca/routed/slave/routed_slave.c b/orte/mca/routed/slave/routed_slave.c index 704ab6201e..614943ac4e 100644 --- a/orte/mca/routed/slave/routed_slave.c +++ b/orte/mca/routed/slave/routed_slave.c @@ -41,6 +41,7 @@ static int update_routing_tree(void); static orte_vpid_t get_routing_tree(opal_list_t *children); static int get_wireup_info(opal_buffer_t *buf); static int set_lifeline(orte_process_name_t *proc); +static size_t num_routes(void); #if OPAL_ENABLE_FT_CR == 1 static int slave_ft_event(int state); @@ -59,6 +60,7 @@ orte_routed_module_t orte_routed_slave_module = { update_routing_tree, get_routing_tree, get_wireup_info, + num_routes, #if OPAL_ENABLE_FT_CR == 1 slave_ft_event #else @@ -288,6 +290,10 @@ static int get_wireup_info(opal_buffer_t *buf) return ORTE_ERR_NOT_SUPPORTED; } +static size_t num_routes(void) +{ + return 0; +} #if OPAL_ENABLE_FT_CR == 1 static int slave_ft_event(int state) diff --git a/orte/orted/orted_comm.c b/orte/orted/orted_comm.c index 3ed4c2dbf3..22245c0c96 100644 --- a/orte/orted/orted_comm.c +++ 
b/orte/orted/orted_comm.c @@ -73,6 +73,7 @@ #include "orte/runtime/runtime.h" #include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_wait.h" +#include "orte/runtime/orte_quit.h" #include "orte/orted/orted.h" @@ -638,16 +639,12 @@ int orte_daemon_process_commands(orte_process_name_t* sender, opal_output(0, "%s orted_cmd: received exit cmd", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } - /* if we are the HNP, just kill our local procs */ - if (ORTE_PROC_IS_HNP) { - orte_odls.kill_local_procs(NULL); - return ORTE_SUCCESS; + /* kill the local procs */ + orte_odls.kill_local_procs(NULL); + /* if all our dependent routes are gone, exit */ + if (0 == orte_routed.num_routes()) { + orte_quit(); } - - /* else we are a daemon, trigger our exit - we will kill our - * local procs on our way out - */ - orte_trigger_event(&orte_exit); return ORTE_SUCCESS; break; @@ -661,7 +658,7 @@ int orte_daemon_process_commands(orte_process_name_t* sender, * NOTE: this event will fire -after- any zero-time events * so any pending relays -do- get sent first */ - orte_trigger_event(&orte_exit); + orte_quit(); return ORTE_SUCCESS; break; diff --git a/orte/orted/orted_main.c b/orte/orted/orted_main.c index 67208dfda7..1ee0c1d37e 100644 --- a/orte/orted/orted_main.c +++ b/orte/orted/orted_main.c @@ -22,6 +22,7 @@ */ #include "orte_config.h" +#include "orte/constants.h" #ifdef HAVE_STRING_H #include @@ -61,13 +62,11 @@ #include "opal/dss/dss.h" #include "opal/mca/sysinfo/sysinfo.h" -#include "orte/constants.h" #include "orte/util/show_help.h" #include "orte/util/proc_info.h" #include "orte/util/session_dir.h" #include "orte/util/name_fns.h" #include "orte/util/nidmap.h" -#include "orte/runtime/orte_locks.h" #include "orte/mca/rml/base/rml_contact.h" #include "orte/mca/errmgr/errmgr.h" @@ -87,30 +86,16 @@ #include "orte/runtime/runtime.h" #include "orte/runtime/orte_globals.h" -#include "orte/runtime/orte_wait.h" +#include "orte/runtime/orte_locks.h" +#include "orte/runtime/orte_quit.h" 
#include "orte/orted/orted.h" /* * Globals */ - -static opal_event_t term_handler; -static opal_event_t int_handler; static opal_event_t pipe_handler; -static opal_event_t epipe_handler; -#ifndef __WINDOWS__ -static opal_event_t sigusr1_handler; -static opal_event_t sigusr2_handler; -#endif /* __WINDOWS__ */ -char *log_path = NULL; -static opal_event_t *orted_exit_event; -static bool signals_set=false; - static void shutdown_callback(int fd, short flags, void *arg); -static void shutdown_signal(int fd, short flags, void *arg); -static void signal_callback(int fd, short event, void *arg); -static void epipe_signal_callback(int fd, short flags, void *arg); static struct { bool debug; @@ -221,10 +206,7 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = { int orte_daemon(int argc, char *argv[]) { int ret = 0; - int fd; opal_cmd_line_t *cmd_line = NULL; - char log_file[PATH_MAX]; - char *jobidstring; char *rml_uri; int i; opal_buffer_t *buffer; @@ -286,9 +268,6 @@ int orte_daemon(int argc, char *argv[]) exit(1); } - /* setup the exit triggers */ - OBJ_CONSTRUCT(&orte_exit, orte_trigger_event_t); - /* save the environment for launch purposes. This MUST be * done so that we can pass it to any local procs we * spawn - otherwise, those local procs won't see any @@ -373,7 +352,7 @@ int orte_daemon(int argc, char *argv[]) * and have it kill us */ if (0 < orted_globals.fail_delay) { - ORTE_TIMER_EVENT(orted_globals.fail_delay, 0, shutdown_signal); + ORTE_TIMER_EVENT(orted_globals.fail_delay, 0, shutdown_callback); } else { opal_output(0, "%s is executing clean %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -420,17 +399,6 @@ int orte_daemon(int argc, char *argv[]) ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid; } - /* setup an event we can wait for to tell - * us to terminate - both normal and abnormal - * termination will call us here. 
Use the same exit - * fd as orterun so that orte_comm can wake either of us up - * since we share that code - */ - if (ORTE_SUCCESS != (ret = orte_wait_event(&orted_exit_event, &orte_exit, "orted_shutdown", shutdown_callback))) { - ORTE_ERROR_LOG(ret); - goto DONE; - } - /* setup the primary daemon command receive function */ ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON, ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL); @@ -439,69 +407,6 @@ int orte_daemon(int argc, char *argv[]) goto DONE; } -#ifndef __WINDOWS__ - /* setup callback for SIGPIPE */ - opal_signal_set(&epipe_handler, SIGPIPE, - epipe_signal_callback, &epipe_handler); - opal_signal_add(&epipe_handler, NULL); - /* Set signal handlers to catch kill signals so we can properly clean up - * after ourselves. - */ - opal_event_set(&term_handler, SIGTERM, OPAL_EV_SIGNAL, - shutdown_signal, NULL); - opal_event_add(&term_handler, NULL); - opal_event_set(&int_handler, SIGINT, OPAL_EV_SIGNAL, - shutdown_signal, NULL); - opal_event_add(&int_handler, NULL); - - /** setup callbacks for signals we should ignore */ - opal_signal_set(&sigusr1_handler, SIGUSR1, - signal_callback, &sigusr1_handler); - opal_signal_add(&sigusr1_handler, NULL); - opal_signal_set(&sigusr2_handler, SIGUSR2, - signal_callback, &sigusr2_handler); - opal_signal_add(&sigusr2_handler, NULL); -#endif /* __WINDOWS__ */ - - signals_set = true; - - /* setup stdout/stderr */ - if (orte_debug_daemons_file_flag) { - /* if we are debugging to a file, then send stdout/stderr to - * the orted log file - */ - - /* get my jobid */ - if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobidstring, - ORTE_PROC_MY_NAME->jobid))) { - ORTE_ERROR_LOG(ret); - goto DONE; - } - - /* define a log file name in the session directory */ - snprintf(log_file, PATH_MAX, "output-orted-%s-%s.log", - jobidstring, orte_process_info.nodename); - log_path = opal_os_path(false, - orte_process_info.tmpdir_base, - orte_process_info.top_session_dir, - 
log_file, - NULL); - - fd = open(log_path, O_RDWR|O_CREAT|O_TRUNC, 0640); - if (fd < 0) { - /* couldn't open the file for some reason, so - * just connect everything to /dev/null - */ - fd = open("/dev/null", O_RDWR|O_CREAT|O_TRUNC, 0666); - } else { - dup2(fd, STDOUT_FILENO); - dup2(fd, STDERR_FILENO); - if(fd != STDOUT_FILENO && fd != STDERR_FILENO) { - close(fd); - } - } - } - /* output a message indicating we are alive, our name, and our pid * for debugging purposes */ @@ -800,41 +705,13 @@ int orte_daemon(int argc, char *argv[]) /* should never get here, but if we do... */ DONE: - if (signals_set) { - /* Release all local signal handlers */ - opal_event_del(&term_handler); - opal_event_del(&int_handler); -#ifndef __WINDOWS__ - opal_signal_del(&sigusr1_handler); - opal_signal_del(&sigusr2_handler); -#endif /* __WINDOWS__ */ - } - - /* cleanup any lingering session directories */ - orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); - - /* cleanup the triggers */ - OBJ_DESTRUCT(&orte_exit); - /* Finalize and clean up ourselves */ - orte_finalize(); + orte_quit(); return ret; } -static void shutdown_signal(int fd, short flags, void *arg) -{ - /* trigger the call to shutdown callback to protect - * against race conditions - the trigger event will - * check the one-time lock - */ - ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - orte_trigger_event(&orte_exit); -} - static void shutdown_callback(int fd, short flags, void *arg) { - int ret; - if (NULL != arg) { /* it's the singleton pipe... 
remove that handler */ opal_event_del(&pipe_handler); @@ -844,27 +721,14 @@ static void shutdown_callback(int fd, short flags, void *arg) opal_output(0, "%s orted: finalizing", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } - /* cleanup */ - if (NULL != log_path) { - unlink(log_path); - } - - /* make sure our local procs are dead */ - orte_odls.kill_local_procs(NULL); - - /* whack any lingering session directory files from our jobs */ - orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); - - /* cleanup the triggers */ - OBJ_DESTRUCT(&orte_exit); - /* if we were ordered to abort, do so */ if (orted_globals.abort) { opal_output(0, "%s is executing clean abort", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); /* do -not- call finalize as this will send a message to the HNP - * indicating clean termination! Instead, just forcibly cleanup - * the local session_dir tree and abort + * indicating clean termination! Instead, just kill our + * local procs, forcibly cleanup the local session_dir tree, and abort */ + orte_odls.kill_local_procs(NULL); orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); abort(); } else if ((int)ORTE_PROC_MY_NAME->vpid == orted_globals.fail) { @@ -873,38 +737,10 @@ static void shutdown_callback(int fd, short flags, void *arg) * indicating clean termination! 
Instead, just forcibly cleanup * the local session_dir tree and exit */ + orte_odls.kill_local_procs(NULL); orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); exit(ORTE_ERROR_DEFAULT_EXIT_CODE); } - if (signals_set) { - /* Release all local signal handlers */ - opal_event_del(&epipe_handler); - opal_event_del(&term_handler); - opal_event_del(&int_handler); -#ifndef __WINDOWS__ - opal_signal_del(&sigusr1_handler); - opal_signal_del(&sigusr2_handler); -#endif /* __WINDOWS__ */ - } - - /* Finalize and clean up ourselves */ - ret = orte_finalize(); - exit(orte_exit_status); -} - -/** - * Deal with sigpipe errors - */ -static void epipe_signal_callback(int fd, short flags, void *arg) -{ - /* for now, we just announce and ignore them */ - opal_output(0, "%s reports a SIGPIPE error on fd %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), fd); - return; -} - -static void signal_callback(int fd, short event, void *arg) -{ - /* just ignore these signals */ + orte_quit(); } diff --git a/orte/runtime/Makefile.am b/orte/runtime/Makefile.am index 4edebe775b..41c16cc9fb 100644 --- a/orte/runtime/Makefile.am +++ b/orte/runtime/Makefile.am @@ -26,6 +26,7 @@ headers += \ runtime/runtime.h \ runtime/orte_locks.h \ runtime/orte_globals.h \ + runtime/orte_quit.h \ runtime/runtime_internals.h \ runtime/data_type_support/orte_dt_support.h @@ -34,6 +35,7 @@ libopen_rte_la_SOURCES += \ runtime/orte_init.c \ runtime/orte_locks.c \ runtime/orte_globals.c \ + runtime/orte_quit.c \ runtime/data_type_support/orte_dt_compare_fns.c \ runtime/data_type_support/orte_dt_copy_fns.c \ runtime/data_type_support/orte_dt_print_fns.c \ diff --git a/orte/runtime/data_type_support/orte_dt_packing_fns.c b/orte/runtime/data_type_support/orte_dt_packing_fns.c index a9bd522a01..f1da6757dd 100644 --- a/orte/runtime/data_type_support/orte_dt_packing_fns.c +++ b/orte/runtime/data_type_support/orte_dt_packing_fns.c @@ -753,6 +753,13 @@ int orte_dt_pack_app_context(opal_buffer_t *buffer, const void *src, ORTE_ERROR_LOG(rc); 
return rc; } + + /* pack the constrain flag */ + if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, + (void*)(&(app_context[i]->constrain)), 1, OPAL_BOOL))) { + ORTE_ERROR_LOG(rc); + return rc; + } } diff --git a/orte/runtime/data_type_support/orte_dt_print_fns.c b/orte/runtime/data_type_support/orte_dt_print_fns.c index b2702c66ac..714223db48 100644 --- a/orte/runtime/data_type_support/orte_dt_print_fns.c +++ b/orte/runtime/data_type_support/orte_dt_print_fns.c @@ -526,9 +526,10 @@ int orte_dt_print_app_context(char **output, char *prefix, orte_app_context_t *s asprintf(&pfx2, "%s", prefix); } - asprintf(&tmp, "\n%sData for app_context: name: %s\t index %lu\tapp: %s\n%s\tNum procs: %lu\tMax Local Restarts: %d\tMax Global Restarts %d", + asprintf(&tmp, "\n%sData for app_context: name: %s\t index %lu\tapp: %s\n%s\tNum procs: %lu\tMax Local Restarts: %d\tMax Global Restarts %d\tConstrain: %s", pfx2, src->name, (unsigned long)src->idx, src->app, - pfx2, (unsigned long)src->num_procs, src->max_local_restarts, src->max_global_restarts); + pfx2, (unsigned long)src->num_procs, src->max_local_restarts, src->max_global_restarts, + src->constrain ? 
"TRUE" : "FALSE"); count = opal_argv_count(src->argv); for (i=0; i < count; i++) { diff --git a/orte/runtime/data_type_support/orte_dt_unpacking_fns.c b/orte/runtime/data_type_support/orte_dt_unpacking_fns.c index 8fae0e2435..768b4e5de9 100644 --- a/orte/runtime/data_type_support/orte_dt_unpacking_fns.c +++ b/orte/runtime/data_type_support/orte_dt_unpacking_fns.c @@ -831,6 +831,13 @@ int orte_dt_unpack_app_context(opal_buffer_t *buffer, void *dest, return rc; } + /* unpack the constrain flag */ + max_n=1; + if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &app_context[i]->constrain, + &max_n, OPAL_BOOL))) { + ORTE_ERROR_LOG(rc); + return rc; + } } return ORTE_SUCCESS; diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index d73a6883fc..669d541fe8 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -60,6 +60,7 @@ bool orte_do_not_launch = false; bool orted_spin_flag = false; bool orte_daemon_bootstrap = false; char *orte_local_cpu_model = NULL; +char *orte_basename = NULL; /* ORTE OOB port flags */ bool orte_static_ports = false; @@ -89,8 +90,7 @@ bool orte_output_debugger_proctable=false; char *orte_debugger_test_daemon=NULL; bool orte_debugger_test_attach=false; -/* exit triggers and flags */ -orte_trigger_event_t orte_exit, orteds_exit; +/* exit flags */ int orte_exit_status = 0; bool orte_abnormal_term_ordered = false; bool orte_routing_is_enabled = false; @@ -144,9 +144,6 @@ bool orte_forward_job_control; char *orte_rsh_agent = NULL; bool orte_assume_same_shell = true; -/* orted exit with barrier */ -bool orte_orted_exit_with_barrier = true; - /* report launch progress */ bool orte_report_launch_progress = false; @@ -535,6 +532,7 @@ static void orte_app_context_construct(orte_app_context_t* app_context) app_context->used_on_node = false; app_context->max_local_restarts = -1; app_context->max_global_restarts = -1; + app_context->constrain = true; } static void orte_app_context_destructor(orte_app_context_t* 
app_context) diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index bdbbd12c22..9ceb707894 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -217,6 +217,11 @@ typedef struct { int32_t max_local_restarts; /* max number of times a process can be relocated to another node */ int32_t max_global_restarts; + /* whether or not the procs in this app are constrained to stay + * on the specified nodes when restarted, or can move to any + * known node + */ + bool constrain; } orte_app_context_t; ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_app_context_t); @@ -287,17 +292,18 @@ typedef struct { ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_node_t); /* define a set of flags to control the launch of a job */ -typedef uint8_t orte_job_controls_t; -#define ORTE_JOB_CONTROL OPAL_UINT8 +typedef uint16_t orte_job_controls_t; +#define ORTE_JOB_CONTROL OPAL_UINT16 -#define ORTE_JOB_CONTROL_LOCAL_SLAVE 0x01 -#define ORTE_JOB_CONTROL_NON_ORTE_JOB 0x02 -#define ORTE_JOB_CONTROL_DEBUGGER_DAEMON 0x14 -#define ORTE_JOB_CONTROL_FORWARD_OUTPUT 0x08 -#define ORTE_JOB_CONTROL_DO_NOT_MONITOR 0x10 -#define ORTE_JOB_CONTROL_FORWARD_COMM 0x20 -#define ORTE_JOB_CONTROL_CONTINUOUS_OP 0x40 -#define ORTE_JOB_CONTROL_RECOVERABLE 0x80 +#define ORTE_JOB_CONTROL_LOCAL_SLAVE 0x0001 +#define ORTE_JOB_CONTROL_NON_ORTE_JOB 0x0002 +#define ORTE_JOB_CONTROL_DEBUGGER_DAEMON 0x0014 +#define ORTE_JOB_CONTROL_FORWARD_OUTPUT 0x0008 +#define ORTE_JOB_CONTROL_DO_NOT_MONITOR 0x0010 +#define ORTE_JOB_CONTROL_FORWARD_COMM 0x0020 +#define ORTE_JOB_CONTROL_CONTINUOUS_OP 0x0040 +#define ORTE_JOB_CONTROL_RECOVERABLE 0x0080 +#define ORTE_JOB_CONTROL_SPIN_FOR_DEBUG 0x0100 #define ORTE_MAPPING_POLICY OPAL_UINT16 /* put the rank assignment method in the upper 8 bits */ @@ -567,6 +573,7 @@ ORTE_DECLSPEC extern bool orte_do_not_launch; ORTE_DECLSPEC extern bool orted_spin_flag; ORTE_DECLSPEC extern bool orte_daemon_bootstrap; ORTE_DECLSPEC extern char *orte_local_cpu_model; +ORTE_DECLSPEC 
extern char *orte_basename; /* ORTE OOB port flags */ ORTE_DECLSPEC extern bool orte_static_ports; @@ -596,9 +603,7 @@ ORTE_DECLSPEC extern bool orte_output_debugger_proctable; ORTE_DECLSPEC extern char *orte_debugger_test_daemon; ORTE_DECLSPEC extern bool orte_debugger_test_attach; -/* exit triggers and flags */ -ORTE_DECLSPEC extern orte_trigger_event_t orte_exit; -ORTE_DECLSPEC extern orte_trigger_event_t orteds_exit; +/* exit flags */ ORTE_DECLSPEC extern int orte_exit_status; ORTE_DECLSPEC extern bool orte_abnormal_term_ordered; ORTE_DECLSPEC extern bool orte_routing_is_enabled; @@ -652,9 +657,6 @@ ORTE_DECLSPEC extern char *orte_xterm; ORTE_DECLSPEC extern char *orte_rsh_agent; ORTE_DECLSPEC extern bool orte_assume_same_shell; -/* whether or not to barrier the orteds upon exit */ -ORTE_DECLSPEC extern bool orte_orted_exit_with_barrier; - /* whether or not to report launch progress */ ORTE_DECLSPEC extern bool orte_report_launch_progress; diff --git a/orte/runtime/orte_locks.c b/orte/runtime/orte_locks.c index 17b676dc6f..584f9781c9 100644 --- a/orte/runtime/orte_locks.c +++ b/orte/runtime/orte_locks.c @@ -27,7 +27,8 @@ opal_atomic_lock_t orte_finalize_lock; /* for HNPs */ opal_atomic_lock_t orte_abort_inprogress_lock; - +opal_atomic_lock_t orte_jobs_complete_lock; +opal_atomic_lock_t orte_quit_lock; int orte_locks_init(void) { @@ -36,6 +37,8 @@ int orte_locks_init(void) /* for HNPs */ opal_atomic_init(&orte_abort_inprogress_lock, OPAL_ATOMIC_UNLOCKED); + opal_atomic_init(&orte_jobs_complete_lock, OPAL_ATOMIC_UNLOCKED); + opal_atomic_init(&orte_quit_lock, OPAL_ATOMIC_UNLOCKED); return ORTE_SUCCESS; } diff --git a/orte/runtime/orte_locks.h b/orte/runtime/orte_locks.h index 484663858b..2474de3c2f 100644 --- a/orte/runtime/orte_locks.h +++ b/orte/runtime/orte_locks.h @@ -35,7 +35,8 @@ ORTE_DECLSPEC extern opal_atomic_lock_t orte_finalize_lock; /* for HNPs */ ORTE_DECLSPEC extern opal_atomic_lock_t orte_abort_inprogress_lock; - +ORTE_DECLSPEC extern 
opal_atomic_lock_t orte_jobs_complete_lock; +ORTE_DECLSPEC extern opal_atomic_lock_t orte_quit_lock; /** * Initialize the locks diff --git a/orte/runtime/orte_quit.c b/orte/runtime/orte_quit.c new file mode 100644 index 0000000000..2c5e208581 --- /dev/null +++ b/orte/runtime/orte_quit.c @@ -0,0 +1,401 @@ +/* -*- C -*- + * + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2008 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006-2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "orte/constants.h" + +#ifdef HAVE_STRING_H +#include +#endif +#include +#ifdef HAVE_UNISTD_H +#include +#endif +#ifdef HAVE_SYS_PARAM_H +#include +#endif +#include +#include +#include +#ifdef HAVE_SYS_TYPES_H +#include +#endif /* HAVE_SYS_TYPES_H */ +#ifdef HAVE_SYS_WAIT_H +#include +#endif /* HAVE_SYS_WAIT_H */ +#ifdef HAVE_SYS_TIME_H +#include +#endif /* HAVE_SYS_TIME_H */ + +#include "orte/mca/plm/plm.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/debugger/debugger.h" +#include "orte/mca/routed/routed.h" + +#include "orte/util/session_dir.h" +#include "orte/util/show_help.h" + +#include "orte/runtime/runtime.h" +#include "orte/runtime/orte_globals.h" +#include "orte/runtime/orte_quit.h" +#include "orte/runtime/orte_locks.h" +#include "orte/runtime/orte_data_server.h" + +/* + * Globals + */ + +static int num_aborted = 0; +static int num_killed = 0; +static int num_failed_start = 0; + +static void dump_aborted_procs(void); + +void orte_jobs_complete(void) +{ + /* check one-time lock to protect against multiple calls */ + if (!opal_atomic_trylock(&orte_jobs_complete_lock)) { /* returns 1 if already locked */ + return; + } + + /* if we never launched, just skip this part to avoid + * meaningless error messages + */ + if (orte_never_launched) { + ORTE_UPDATE_EXIT_STATUS(orte_exit_status); + orte_quit(); + } + + if (0 != orte_exit_status && !orte_execute_quiet) { + /* abnormal termination of some kind */ + dump_aborted_procs(); + /* If we showed more abort messages than were allowed, + show a followup message here */ + if (num_failed_start > 1) { + if (orte_xml_output) { + fprintf(orte_xml_fp, ""); + } + fprintf(orte_xml_fp, "%d total process%s failed to start", + num_failed_start, ((num_failed_start > 1) ? 
"es" : "")); + if (orte_xml_output) { + fprintf(orte_xml_fp, " "); + } + fprintf(orte_xml_fp, "\n"); + } + if (num_aborted > 1) { + if (orte_xml_output) { + fprintf(orte_xml_fp, ""); + } + fprintf(orte_xml_fp, "%d total process%s aborted", + num_aborted, ((num_aborted > 1) ? "es" : "")); + if (orte_xml_output) { + fprintf(orte_xml_fp, " "); + } + fprintf(orte_xml_fp, "\n"); + } + if (num_killed > 1) { + if (orte_xml_output) { + fprintf(orte_xml_fp, ""); + } + fprintf(orte_xml_fp, "%d total process%s killed (some possibly by %s during cleanup)", + num_killed, ((num_killed > 1) ? "es" : ""), orte_basename); + if (orte_xml_output) { + fprintf(orte_xml_fp, " "); + } + fprintf(orte_xml_fp, "\n"); + } + } + + /* if the debuggers were run, clean up */ + orte_debugger.finalize(); + + if (0 < orte_routed.num_routes()) { + orte_plm.terminate_orteds(); + } +} + +void orte_quit(void) +{ + /* check one-time lock to protect against "bounce" */ + if (!opal_atomic_trylock(&orte_quit_lock)) { /* returns 1 if already locked */ + return; + } + + /* whack any lingering session directory files from our jobs */ + orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); + + /* cleanup our data server */ + orte_data_server_finalize(); + + /* cleanup and leave */ + orte_finalize(); + + if (NULL != orte_basename) { + free(orte_basename); + } + + if (orte_debug_flag) { + fprintf(stderr, "orterun: exiting with status %d\n", orte_exit_status); + } + exit(orte_exit_status); +} + + +/* + * On abnormal termination - dump the + * exit status of the aborted procs. 
+ */ + +static void dump_aborted_procs(void) +{ + orte_std_cntr_t i, n; + orte_proc_t *proc, *pptr; + orte_app_context_t *app, *approc; + orte_job_t *job; + orte_node_t *node; + + /* find the job that caused the problem - be sure to start the loop + * at 1 as the daemons are in 0 and will clearly be "running", so no + * point in checking them + */ + for (n=1; n < orte_job_data->size; n++) { + if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, n))) { + /* the array is no longer left-justified, so we have to continue */ + continue; + } + if (ORTE_JOB_STATE_UNDEF != job->state && + ORTE_JOB_STATE_INIT != job->state && + ORTE_JOB_STATE_LAUNCHED != job->state && + ORTE_JOB_STATE_RUNNING != job->state && + ORTE_JOB_STATE_TERMINATED != job->state && + ORTE_JOB_STATE_ABORT_ORDERED != job->state) { + /* this is a guilty party */ + proc = job->aborted_proc; + /* always must be at least one app */ + app = (orte_app_context_t*)opal_pointer_array_get_item(job->apps, 0); + /* cycle through and count the number that were killed or aborted */ + for (i=0; i < job->procs->size; i++) { + if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(job->procs, i))) { + /* array is left-justfied - we are done */ + continue; + } + if (ORTE_PROC_STATE_FAILED_TO_START == pptr->state) { + ++num_failed_start; + } else if (ORTE_PROC_STATE_ABORTED == pptr->state) { + ++num_aborted; + } else if (ORTE_PROC_STATE_ABORTED_BY_SIG == pptr->state) { + ++num_killed; + } + } + approc = (orte_app_context_t*)opal_pointer_array_get_item(job->apps, proc->app_idx); + node = proc->node; + if (ORTE_JOB_STATE_FAILED_TO_START == job->state) { + if (NULL == proc) { + orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status-no-node", true, + orte_basename); + return; + } + switch (OPAL_SOS_GET_ERROR_CODE(proc->exit_code)) { + case ORTE_ERR_SYS_LIMITS_PIPES: + orte_show_help("help-orterun.txt", "orterun:sys-limit-pipe", true, + orte_basename, proc->node->name, + 
(unsigned long)proc->name.vpid); + break; + case ORTE_ERR_PIPE_SETUP_FAILURE: + orte_show_help("help-orterun.txt", "orterun:pipe-setup-failure", true, + orte_basename, proc->node->name, + (unsigned long)proc->name.vpid); + break; + case ORTE_ERR_SYS_LIMITS_CHILDREN: + orte_show_help("help-orterun.txt", "orterun:sys-limit-children", true, + orte_basename, proc->node->name, + (unsigned long)proc->name.vpid); + break; + case ORTE_ERR_FAILED_GET_TERM_ATTRS: + orte_show_help("help-orterun.txt", "orterun:failed-term-attrs", true, + orte_basename, proc->node->name, + (unsigned long)proc->name.vpid); + break; + case ORTE_ERR_WDIR_NOT_FOUND: + orte_show_help("help-orterun.txt", "orterun:wdir-not-found", true, + orte_basename, approc->cwd, + proc->node->name, (unsigned long)proc->name.vpid); + break; + case ORTE_ERR_EXE_NOT_FOUND: + orte_show_help("help-orterun.txt", "orterun:exe-not-found", true, + orte_basename, + (unsigned long)proc->name.vpid, + orte_basename, + orte_basename, + proc->node->name, + approc->app); + break; + case ORTE_ERR_EXE_NOT_ACCESSIBLE: + orte_show_help("help-orterun.txt", "orterun:exe-not-accessible", true, + orte_basename, approc->app, proc->node->name, + (unsigned long)proc->name.vpid); + break; + case ORTE_ERR_MULTIPLE_AFFINITIES: + orte_show_help("help-orterun.txt", + "orterun:multiple-paffinity-schemes", true, proc->slot_list); + break; + case ORTE_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED: + orte_show_help("help-orterun.txt", + "orterun:topo-not-supported", + true, orte_process_info.nodename, "rankfile containing a slot_list of ", + proc->slot_list, approc->app); + break; + case ORTE_ERR_INVALID_NODE_RANK: + orte_show_help("help-orterun.txt", + "orterun:invalid-node-rank", true); + break; + case ORTE_ERR_INVALID_LOCAL_RANK: + orte_show_help("help-orterun.txt", + "orterun:invalid-local-rank", true); + break; + case ORTE_ERR_NOT_ENOUGH_CORES: + orte_show_help("help-orterun.txt", + "orterun:not-enough-resources", true, + "sockets", node->name, + 
"bind-to-core", approc->app); + break; + case ORTE_ERR_TOPO_CORE_NOT_SUPPORTED: + orte_show_help("help-orterun.txt", + "orterun:topo-not-supported", + true, node->name, "bind-to-core", "", + approc->app); + break; + case ORTE_ERR_INVALID_PHYS_CPU: + orte_show_help("help-orterun.txt", + "orterun:invalid-phys-cpu", true); + break; + case ORTE_ERR_NOT_ENOUGH_SOCKETS: + orte_show_help("help-orterun.txt", + "orterun:not-enough-resources", true, + "sockets", node->name, + "bind-to-socket", approc->app); + break; + case ORTE_ERR_TOPO_SOCKET_NOT_SUPPORTED: + orte_show_help("help-orterun.txt", + "orterun:topo-not-supported", + true, node->name, "bind-to-socket", "", + approc->app); + break; + case ORTE_ERR_MODULE_NOT_FOUND: + orte_show_help("help-orterun.txt", + "orterun:paffinity-missing-module", + true, node->name); + break; + case ORTE_ERR_SLOT_LIST_RANGE: + orte_show_help("help-orterun.txt", + "orterun:invalid-slot-list-range", + true, node->name, proc->slot_list); + break; + case ORTE_ERR_PIPE_READ_FAILURE: + orte_show_help("help-orterun.txt", "orterun:pipe-read-failure", true, + orte_basename, node->name, (unsigned long)proc->name.vpid); + break; + case ORTE_ERR_SOCKET_NOT_AVAILABLE: + orte_show_help("help-orterun.txt", "orterun:proc-socket-not-avail", true, + orte_basename, ORTE_ERROR_NAME(proc->exit_code), node->name, + (unsigned long)proc->name.vpid); + break; + + default: + if (0 != proc->exit_code) { + orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start", true, + orte_basename, ORTE_ERROR_NAME(proc->exit_code), node->name, + (unsigned long)proc->name.vpid); + } else { + orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status", true, + orte_basename, node->name); + } + break; + } + } else if (ORTE_JOB_STATE_ABORTED == job->state) { + if (NULL == proc) { + orte_show_help("help-orterun.txt", "orterun:proc-aborted-unknown", true, + orte_basename); + } else { + orte_show_help("help-orterun.txt", "orterun:proc-ordered-abort", true, + 
orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid, + node->name, orte_basename); + } + } else if (ORTE_JOB_STATE_ABORTED_BY_SIG == job->state) { /* aborted by signal */ + if (NULL == proc) { + orte_show_help("help-orterun.txt", "orterun:proc-aborted-signal-unknown", true, + orte_basename); + } else { +#ifdef HAVE_STRSIGNAL + if (NULL != strsignal(WTERMSIG(proc->exit_code))) { + orte_show_help("help-orterun.txt", "orterun:proc-aborted-strsignal", true, + orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid, + node->name, WTERMSIG(proc->exit_code), + strsignal(WTERMSIG(proc->exit_code))); + } else { +#endif + orte_show_help("help-orterun.txt", "orterun:proc-aborted", true, + orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid, + node->name, WTERMSIG(proc->exit_code)); +#ifdef HAVE_STRSIGNAL + } +#endif + } + } else if (ORTE_JOB_STATE_ABORTED_WO_SYNC == job->state) { /* proc exited w/o finalize */ + if (NULL == proc) { + orte_show_help("help-orterun.txt", "orterun:proc-exit-no-sync-unknown", true, + orte_basename, orte_basename); + } else { + orte_show_help("help-orterun.txt", "orterun:proc-exit-no-sync", true, + orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid, + node->name, orte_basename, orte_basename); + } + } else if (ORTE_JOB_STATE_COMM_FAILED == job->state) { + orte_show_help("help-orterun.txt", "orterun:proc-comm-failed", true, + ORTE_NAME_PRINT(&proc->name), node->name); + } else if (ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED == job->state) { + switch (proc->exit_code) { + case ORTE_ERR_MEM_LIMIT_EXCEEDED: + orte_show_help("help-orterun.txt", "orterun:proc-mem-exceeded", true, + ORTE_NAME_PRINT(&proc->name), node->name); + break; + case ORTE_ERR_PROC_STALLED: + orte_show_help("help-orterun.txt", "orterun:proc-stalled", true); + break; + + default: + orte_show_help("help-orterun.txt", "orterun:proc-sensor-exceeded", true); + break; + } + } else if (ORTE_JOB_STATE_CALLED_ABORT == 
job->state) { + orte_show_help("help-orterun.txt", "orterun:proc-called-abort", true, + orte_basename, + (0 == strncmp("orte", orte_basename, 4)) ? "orte" : "MPI"); + } else if (ORTE_JOB_STATE_HEARTBEAT_FAILED == job->state) { + orte_show_help("help-orterun.txt", "orterun:proc-heartbeat-failed", true, + orte_basename, ORTE_NAME_PRINT(&proc->name), node->name); + } + return; + } + } +} diff --git a/orte/runtime/orte_quit.h b/orte/runtime/orte_quit.h new file mode 100644 index 0000000000..631d6665fa --- /dev/null +++ b/orte/runtime/orte_quit.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * + */ + +#ifndef ORTE_QUIT_H +#define ORTE_QUIT_H + +#include "orte_config.h" + +BEGIN_C_DECLS + +ORTE_DECLSPEC void orte_jobs_complete(void); + +ORTE_DECLSPEC void orte_quit(void); + +END_C_DECLS + +#endif /* ORTE_CR_H */ diff --git a/orte/tools/orte-top/orte-top.c b/orte/tools/orte-top/orte-top.c index 67237fc66e..68cca1d042 100644 --- a/orte/tools/orte-top/orte-top.c +++ b/orte/tools/orte-top/orte-top.c @@ -56,6 +56,7 @@ #include "orte/util/proc_info.h" #include "orte/runtime/orte_wait.h" #include "orte/mca/rml/base/rml_contact.h" +#include "orte/runtime/orte_quit.h" /* * Local variables & functions @@ -68,7 +69,6 @@ static bool all_recvd; static int32_t num_replies; static int32_t num_recvd; static opal_buffer_t cmdbuf; -static opal_event_t *my_exit_event; static FILE *fp = NULL; static bool help; static char *hnppidstr; @@ -181,7 +181,7 @@ static void send_cmd(int fd, short dummy, void *arg) num_recvd = 0; if (0 > (ret = orte_rml.send_buffer(&(target_hnp->name), &cmdbuf, ORTE_RML_TAG_DAEMON, 0))) { ORTE_ERROR_LOG(ret); - orte_trigger_event(&orteds_exit); + orte_quit(); return; } @@ -197,7 +197,7 @@ static void send_cmd(int fd, short dummy, void *arg) if (0 < update_rate) { ORTE_TIMER_EVENT(update_rate, 0, send_cmd); } else { - 
orte_trigger_event(&orte_exit); + orte_quit(); } } @@ -263,14 +263,7 @@ main(int argc, char *argv[]) return 1; } - OBJ_CONSTRUCT(&orte_exit, orte_trigger_event_t); - - if (ORTE_SUCCESS != orte_wait_event(&my_exit_event, &orte_exit, "job_complete", abort_exit_callback)) { - orte_finalize(); - exit(1); - } - - /* setup the list for recvd stats */ + /* setup the list for recvd stats */ OBJ_CONSTRUCT(&recvd_stats, opal_list_t); /** setup callbacks for abort signals - from this point @@ -567,8 +560,8 @@ static void abort_exit_callback(int fd, short ign, void *arg) if (NULL != fp && fp != stdout) { fclose(fp); } - orte_finalize(); - exit(1); + ORTE_UPDATE_EXIT_STATUS(1); + orte_quit(); } static void process_stats(int fd, short event, void *data) diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index 98f9078aea..8b9a90bd3f 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -92,6 +92,7 @@ #include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_data_server.h" #include "orte/runtime/orte_locks.h" +#include "orte/runtime/orte_quit.h" /* ensure I can behave like a daemon */ #include "orte/orted/orted.h" @@ -101,31 +102,13 @@ /* * Globals */ -static struct opal_event term_handler; -static struct opal_event int_handler; -static struct opal_event epipe_handler; -#ifndef __WINDOWS__ -static struct opal_event sigusr1_handler; -static struct opal_event sigusr2_handler; -static struct opal_event sigtstp_handler; -static struct opal_event sigcont_handler; -#endif /* __WINDOWS__ */ static orte_job_t *jdata=NULL; -static char *orterun_basename = NULL; -static int num_aborted = 0; -static int num_killed = 0; -static int num_failed_start = 0; static char **global_mca_env = NULL; static bool have_zero_np = false; static orte_std_cntr_t total_num_apps = 0; static bool want_prefix_by_default = (bool) ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT; -static opal_event_t *orterun_event=NULL, *orteds_exit_event=NULL; static char 
*ompi_server=NULL; -static opal_event_t *abort_exit_event=NULL; -static bool forcibly_die = false; -static opal_event_t *timeout_ev=NULL; static bool profile_is_set = false; -static bool signals_set=false; /* * Globals @@ -145,7 +128,7 @@ static opal_cmd_line_init_t cmd_line_init[] = { &orterun_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL, "Be verbose" }, { "orte", "execute", "quiet", 'q', NULL, "quiet", 0, - &orterun_globals.quiet, OPAL_CMD_LINE_TYPE_BOOL, + NULL, OPAL_CMD_LINE_TYPE_BOOL, "Suppress helpful messages" }, { NULL, NULL, NULL, '\0', "report-pid", "report-pid", 1, &orterun_globals.report_pid, OPAL_CMD_LINE_TYPE_STRING, @@ -455,20 +438,12 @@ static opal_cmd_line_init_t cmd_line_init[] = { /* * Local functions */ -static void job_completed(int trigpipe, short event, void *arg); -static void abort_signal_callback(int fd, short flags, void *arg); -static void abort_exit_callback(int fd, short event, void *arg); -static void epipe_signal_callback(int fd, short flags, void *arg); -static void signal_forward_callback(int fd, short event, void *arg); static int create_app(int argc, char* argv[], orte_app_context_t **app, bool *made_app, char ***app_env); static int init_globals(void); static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line); static int parse_locals(int argc, char* argv[]); static int parse_appfile(char *filename, char ***env); -static void dump_aborted_procs(void); -static void just_quit(int fd, short ign, void *arg); - static void run_debugger(char *basename, opal_cmd_line_t *cmd_line, int argc, char *argv[], int num_procs); @@ -480,7 +455,7 @@ int orterun(int argc, char *argv[]) /* find our basename (the name of the executable) so that we can use it in pretty-print error messages */ - orterun_basename = opal_basename(argv[0]); + orte_basename = opal_basename(argv[0]); /* Setup and parse the command line */ init_globals(); @@ -525,10 +500,6 @@ int orterun(int argc, char *argv[]) exit(1); } - /* setup the exit triggers */ - 
OBJ_CONSTRUCT(&orte_exit, orte_trigger_event_t); - OBJ_CONSTRUCT(&orteds_exit, orte_trigger_event_t); - /* flag that I am the HNP - needs to be done prior to * registering params */ @@ -582,7 +553,7 @@ int orterun(int argc, char *argv[]) /* This should never happen -- this case should be caught in create_app(), but let's just double check... */ orte_show_help("help-orterun.txt", "orterun:nothing-to-do", - true, orterun_basename); + true, orte_basename); exit(ORTE_ERROR_DEFAULT_EXIT_CODE); } @@ -634,7 +605,7 @@ int orterun(int argc, char *argv[]) fp = fopen(orterun_globals.report_uri, "w"); if (NULL == fp) { orte_show_help("help-orterun.txt", "orterun:write_file", false, - orterun_basename, "uri", orterun_globals.report_uri); + orte_basename, "uri", orterun_globals.report_uri); exit(0); } fprintf(fp, "%s\n", (NULL == rml_uri) ? "NULL" : rml_uri); @@ -655,68 +626,6 @@ int orterun(int argc, char *argv[]) but what the heck... :-) */ opal_progress_set_event_flag(OPAL_EVLOOP_ONCE); - /* setup an event we can wait for that will tell - * us to terminate - both normal and abnormal - * termination will call us here. Use the - * same exit fd as the daemon does so that orted_comm - * can cause either of us to exit since we share that code - */ - if (ORTE_SUCCESS != (rc = orte_wait_event(&orterun_event, &orte_exit, "job_complete", job_completed))) { - orte_show_help("help-orterun.txt", "orterun:event-def-failed", true, - orterun_basename, ORTE_ERROR_NAME(rc)); - ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - goto DONE; - } - - /* setup an event that will - * trigger when the orteds are gone and tell the orteds that it is - * okay to finalize and exit, we are done with them. - * We set this up here in order to provide a way for us to - * wakeup and terminate should the daemons themselves fail to launch, - * and before we define signal handlers since they will call the - * exit event trigger! 
- */ - if (ORTE_SUCCESS != (rc = orte_wait_event(&orteds_exit_event, &orteds_exit, "orted_exit", just_quit))) { - orte_show_help("help-orterun.txt", "orterun:event-def-failed", true, - orterun_basename, ORTE_ERROR_NAME(rc)); - goto DONE; - } - -#ifndef __WINDOWS__ - /* setup callback for SIGPIPE */ - opal_signal_set(&epipe_handler, SIGPIPE, - epipe_signal_callback, &epipe_handler); - opal_signal_add(&epipe_handler, NULL); - /** setup callbacks for abort signals - from this point - * forward, we need to abort in a manner that allows us - * to cleanup - */ - opal_signal_set(&term_handler, SIGTERM, - abort_signal_callback, &term_handler); - opal_signal_add(&term_handler, NULL); - opal_signal_set(&int_handler, SIGINT, - abort_signal_callback, &int_handler); - opal_signal_add(&int_handler, NULL); - - /** setup callbacks for signals we should foward */ - opal_signal_set(&sigusr1_handler, SIGUSR1, - signal_forward_callback, &sigusr1_handler); - opal_signal_add(&sigusr1_handler, NULL); - opal_signal_set(&sigusr2_handler, SIGUSR2, - signal_forward_callback, &sigusr2_handler); - opal_signal_add(&sigusr2_handler, NULL); - if (orte_forward_job_control) { - opal_signal_set(&sigtstp_handler, SIGTSTP, - signal_forward_callback, &sigtstp_handler); - opal_signal_add(&sigtstp_handler, NULL); - opal_signal_set(&sigcont_handler, SIGCONT, - signal_forward_callback, &sigcont_handler); - opal_signal_add(&sigcont_handler, NULL); - } -#endif /* __WINDOWS__ */ - - signals_set = true; - /* If we have a prefix, then modify the PATH and LD_LIBRARY_PATH environment variables in our copy. 
This will ensure that any locally-spawned children will @@ -743,7 +652,7 @@ int orterun(int argc, char *argv[]) } opal_setenv("PATH", newenv, true, &orte_launch_environ); if (orte_debug_flag) { - opal_output(0, "%s: reset PATH: %s", orterun_basename, newenv); + opal_output(0, "%s: reset PATH: %s", orte_basename, newenv); } free(newenv); free(bin_base); @@ -760,7 +669,7 @@ int orterun(int argc, char *argv[]) opal_setenv("LD_LIBRARY_PATH", newenv, true, &orte_launch_environ); if (orte_debug_flag) { opal_output(0, "%s: reset LD_LIBRARY_PATH: %s", - orterun_basename, newenv); + orte_basename, newenv); } free(newenv); free(lib_base); @@ -770,7 +679,7 @@ int orterun(int argc, char *argv[]) if (ORTE_SUCCESS != (rc = orte_pre_condition_transports(jdata))) { ORTE_ERROR_LOG(rc); orte_show_help("help-orterun.txt", "orterun:precondition", false, - orterun_basename, NULL, NULL, rc); + orte_basename, NULL, NULL, rc); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); goto DONE; } @@ -821,7 +730,7 @@ int orterun(int argc, char *argv[]) if (ORTE_SUCCESS != (rc = orte_rml.ping(ompi_server, &timeout))) { /* okay give up */ orte_show_help("help-orterun.txt", "orterun:server-not-found", true, - orterun_basename, ompi_server, + orte_basename, ompi_server, (long)orterun_globals.server_wait_timeout, ORTE_ERROR_NAME(rc)); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); @@ -848,524 +757,10 @@ int orterun(int argc, char *argv[]) */ DONE: ORTE_UPDATE_EXIT_STATUS(orte_exit_status); - just_quit(0,0,NULL); + orte_quit(); return orte_exit_status; } -static void job_completed(int trigpipe, short event, void *arg) -{ - int rc; - orte_job_t *daemons; - - /* if the abort exit event is set, delete it */ - if (NULL != abort_exit_event) { - opal_evtimer_del(abort_exit_event); - free(abort_exit_event); - } - - /* if we never launched, just skip this part to avoid - * meaningless error messages - */ - if (orte_never_launched) { - rc = orte_exit_status; - goto DONE; - } - - if (0 != 
orte_exit_status && !orterun_globals.quiet) { - /* abnormal termination of some kind */ - dump_aborted_procs(); - /* If we showed more abort messages than were allowed, - show a followup message here */ - if (num_failed_start > 1) { - if (orte_xml_output) { - fprintf(orte_xml_fp, ""); - } - fprintf(orte_xml_fp, "%d total process%s failed to start", - num_failed_start, ((num_failed_start > 1) ? "es" : "")); - if (orte_xml_output) { - fprintf(orte_xml_fp, " "); - } - fprintf(orte_xml_fp, "\n"); - } - if (num_aborted > 1) { - if (orte_xml_output) { - fprintf(orte_xml_fp, ""); - } - fprintf(orte_xml_fp, "%d total process%s aborted", - num_aborted, ((num_aborted > 1) ? "es" : "")); - if (orte_xml_output) { - fprintf(orte_xml_fp, " "); - } - fprintf(orte_xml_fp, "\n"); - } - if (num_killed > 1) { - if (orte_xml_output) { - fprintf(orte_xml_fp, ""); - } - fprintf(orte_xml_fp, "%d total process%s killed (some possibly by %s during cleanup)", - num_killed, ((num_killed > 1) ? "es" : ""), orterun_basename); - if (orte_xml_output) { - fprintf(orte_xml_fp, " "); - } - fprintf(orte_xml_fp, "\n"); - } - } - - /* if the debuggers were run, clean up */ - orte_debugger.finalize(); - - if (ORTE_SUCCESS != (rc = orte_plm.terminate_orteds())) { - /* since we know that the sends didn't completely go out, - * we know that the barrier will never complete. 
Add a timeout so - * that those daemons that can respond have a chance to do - * so - */ - /* get the orted job data object */ - if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { - /* we are totally hozed */ - goto DONE; - } - ORTE_DETECT_TIMEOUT(&timeout_ev, daemons->num_procs, - orte_timeout_usec_per_proc, - orte_max_timeout, just_quit); - } - - /* ensure all the orteds depart together */ - orte_grpcomm.onesided_barrier(); - -DONE: - ORTE_UPDATE_EXIT_STATUS(rc); - just_quit(0, 0, NULL); -} - -static void just_quit(int fd, short ign, void *arg) -{ - /* if the orted exit event is set, delete it */ - if (NULL != orteds_exit_event) { - opal_evtimer_del(orteds_exit_event); - free(orteds_exit_event); - } - - if (signals_set) { - /* Remove the epipe handler */ - opal_signal_del(&epipe_handler); - /* Remove the TERM and INT signal handlers */ - opal_signal_del(&term_handler); - opal_signal_del(&int_handler); -#ifndef __WINDOWS__ - /** Remove the USR signal handlers */ - opal_signal_del(&sigusr1_handler); - opal_signal_del(&sigusr2_handler); - if (orte_forward_job_control) { - opal_signal_del(&sigtstp_handler); - opal_signal_del(&sigcont_handler); - } -#endif /* __WINDOWS__ */ - signals_set = false; - } - - /* whack any lingering session directory files from our jobs */ - orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); - - /* cleanup our data server */ - orte_data_server_finalize(); - - /* cleanup and leave */ - orte_finalize(); - - free(orterun_basename); - if (orte_debug_flag) { - fprintf(stderr, "orterun: exiting with status %d\n", orte_exit_status); - } - exit(orte_exit_status); -} - - -/* - * On abnormal termination - dump the - * exit status of the aborted procs. 
- */ - -static void dump_aborted_procs(void) -{ - orte_std_cntr_t i, n; - orte_proc_t *proc, *pptr; - orte_app_context_t *app, *approc; - orte_job_t *job; - orte_node_t *node; - - /* find the job that caused the problem - be sure to start the loop - * at 1 as the daemons are in 0 and will clearly be "running", so no - * point in checking them - */ - for (n=1; n < orte_job_data->size; n++) { - if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, n))) { - /* the array is no longer left-justified, so we have to continue */ - continue; - } - if (ORTE_JOB_STATE_UNDEF != job->state && - ORTE_JOB_STATE_INIT != job->state && - ORTE_JOB_STATE_LAUNCHED != job->state && - ORTE_JOB_STATE_RUNNING != job->state && - ORTE_JOB_STATE_TERMINATED != job->state && - ORTE_JOB_STATE_ABORT_ORDERED != job->state) { - /* this is a guilty party */ - proc = job->aborted_proc; - /* always must be at least one app */ - app = (orte_app_context_t*)opal_pointer_array_get_item(job->apps, 0); - /* cycle through and count the number that were killed or aborted */ - for (i=0; i < job->procs->size; i++) { - if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(job->procs, i))) { - /* array is left-justfied - we are done */ - continue; - } - if (ORTE_PROC_STATE_FAILED_TO_START == pptr->state) { - ++num_failed_start; - } else if (ORTE_PROC_STATE_ABORTED == pptr->state) { - ++num_aborted; - } else if (ORTE_PROC_STATE_ABORTED_BY_SIG == pptr->state) { - ++num_killed; - } - } - approc = (orte_app_context_t*)opal_pointer_array_get_item(job->apps, proc->app_idx); - node = proc->node; - if (ORTE_JOB_STATE_FAILED_TO_START == job->state) { - if (NULL == proc) { - orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status-no-node", true, - orterun_basename); - return; - } - switch (OPAL_SOS_GET_ERROR_CODE(proc->exit_code)) { - case ORTE_ERR_SYS_LIMITS_PIPES: - orte_show_help("help-orterun.txt", "orterun:sys-limit-pipe", true, - orterun_basename, proc->node->name, - 
(unsigned long)proc->name.vpid); - break; - case ORTE_ERR_PIPE_SETUP_FAILURE: - orte_show_help("help-orterun.txt", "orterun:pipe-setup-failure", true, - orterun_basename, proc->node->name, - (unsigned long)proc->name.vpid); - break; - case ORTE_ERR_SYS_LIMITS_CHILDREN: - orte_show_help("help-orterun.txt", "orterun:sys-limit-children", true, - orterun_basename, proc->node->name, - (unsigned long)proc->name.vpid); - break; - case ORTE_ERR_FAILED_GET_TERM_ATTRS: - orte_show_help("help-orterun.txt", "orterun:failed-term-attrs", true, - orterun_basename, proc->node->name, - (unsigned long)proc->name.vpid); - break; - case ORTE_ERR_WDIR_NOT_FOUND: - orte_show_help("help-orterun.txt", "orterun:wdir-not-found", true, - orterun_basename, approc->cwd, - proc->node->name, (unsigned long)proc->name.vpid); - break; - case ORTE_ERR_EXE_NOT_FOUND: - orte_show_help("help-orterun.txt", "orterun:exe-not-found", true, - orterun_basename, - (unsigned long)proc->name.vpid, - orterun_basename, - orterun_basename, - proc->node->name, - approc->app); - break; - case ORTE_ERR_EXE_NOT_ACCESSIBLE: - orte_show_help("help-orterun.txt", "orterun:exe-not-accessible", true, - orterun_basename, approc->app, proc->node->name, - (unsigned long)proc->name.vpid); - break; - case ORTE_ERR_MULTIPLE_AFFINITIES: - orte_show_help("help-orterun.txt", - "orterun:multiple-paffinity-schemes", true, proc->slot_list); - break; - case ORTE_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED: - orte_show_help("help-orterun.txt", - "orterun:topo-not-supported", - true, orte_process_info.nodename, "rankfile containing a slot_list of ", - proc->slot_list, approc->app); - break; - case ORTE_ERR_INVALID_NODE_RANK: - orte_show_help("help-orterun.txt", - "orterun:invalid-node-rank", true); - break; - case ORTE_ERR_INVALID_LOCAL_RANK: - orte_show_help("help-orterun.txt", - "orterun:invalid-local-rank", true); - break; - case ORTE_ERR_NOT_ENOUGH_CORES: - orte_show_help("help-orterun.txt", - "orterun:not-enough-resources", true, - "sockets", 
node->name, - "bind-to-core", approc->app); - break; - case ORTE_ERR_TOPO_CORE_NOT_SUPPORTED: - orte_show_help("help-orterun.txt", - "orterun:topo-not-supported", - true, node->name, "bind-to-core", "", - approc->app); - break; - case ORTE_ERR_INVALID_PHYS_CPU: - orte_show_help("help-orterun.txt", - "orterun:invalid-phys-cpu", true); - break; - case ORTE_ERR_NOT_ENOUGH_SOCKETS: - orte_show_help("help-orterun.txt", - "orterun:not-enough-resources", true, - "sockets", node->name, - "bind-to-socket", approc->app); - break; - case ORTE_ERR_TOPO_SOCKET_NOT_SUPPORTED: - orte_show_help("help-orterun.txt", - "orterun:topo-not-supported", - true, node->name, "bind-to-socket", "", - approc->app); - break; - case ORTE_ERR_MODULE_NOT_FOUND: - orte_show_help("help-orterun.txt", - "orterun:paffinity-missing-module", - true, node->name); - break; - case ORTE_ERR_SLOT_LIST_RANGE: - orte_show_help("help-orterun.txt", - "orterun:invalid-slot-list-range", - true, node->name, proc->slot_list); - break; - case ORTE_ERR_PIPE_READ_FAILURE: - orte_show_help("help-orterun.txt", "orterun:pipe-read-failure", true, - orterun_basename, node->name, (unsigned long)proc->name.vpid); - break; - case ORTE_ERR_SOCKET_NOT_AVAILABLE: - orte_show_help("help-orterun.txt", "orterun:proc-socket-not-avail", true, - orterun_basename, ORTE_ERROR_NAME(proc->exit_code), node->name, - (unsigned long)proc->name.vpid); - break; - - default: - if (0 != proc->exit_code) { - orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start", true, - orterun_basename, ORTE_ERROR_NAME(proc->exit_code), node->name, - (unsigned long)proc->name.vpid); - } else { - orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status", true, - orterun_basename, node->name); - } - break; - } - } else if (ORTE_JOB_STATE_ABORTED == job->state) { - if (NULL == proc) { - orte_show_help("help-orterun.txt", "orterun:proc-aborted-unknown", true, - orterun_basename); - } else { - orte_show_help("help-orterun.txt", 
"orterun:proc-ordered-abort", true, - orterun_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid, - node->name, orterun_basename); - } - } else if (ORTE_JOB_STATE_ABORTED_BY_SIG == job->state) { /* aborted by signal */ - if (NULL == proc) { - orte_show_help("help-orterun.txt", "orterun:proc-aborted-signal-unknown", true, - orterun_basename); - } else { -#ifdef HAVE_STRSIGNAL - if (NULL != strsignal(WTERMSIG(proc->exit_code))) { - orte_show_help("help-orterun.txt", "orterun:proc-aborted-strsignal", true, - orterun_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid, - node->name, WTERMSIG(proc->exit_code), - strsignal(WTERMSIG(proc->exit_code))); - } else { -#endif - orte_show_help("help-orterun.txt", "orterun:proc-aborted", true, - orterun_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid, - node->name, WTERMSIG(proc->exit_code)); -#ifdef HAVE_STRSIGNAL - } -#endif - } - } else if (ORTE_JOB_STATE_ABORTED_WO_SYNC == job->state) { /* proc exited w/o finalize */ - if (NULL == proc) { - orte_show_help("help-orterun.txt", "orterun:proc-exit-no-sync-unknown", true, - orterun_basename, orterun_basename); - } else { - orte_show_help("help-orterun.txt", "orterun:proc-exit-no-sync", true, - orterun_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid, - node->name, orterun_basename, orterun_basename); - } - } else if (ORTE_JOB_STATE_COMM_FAILED == job->state) { - orte_show_help("help-orterun.txt", "orterun:proc-comm-failed", true, - ORTE_NAME_PRINT(&proc->name), node->name); - } else if (ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED == job->state) { - switch (proc->exit_code) { - case ORTE_ERR_MEM_LIMIT_EXCEEDED: - orte_show_help("help-orterun.txt", "orterun:proc-mem-exceeded", true, - ORTE_NAME_PRINT(&proc->name), node->name); - break; - case ORTE_ERR_PROC_STALLED: - orte_show_help("help-orterun.txt", "orterun:proc-stalled", true); - break; - - default: - orte_show_help("help-orterun.txt", "orterun:proc-sensor-exceeded", 
true); - break; - } - } else if (ORTE_JOB_STATE_CALLED_ABORT == job->state) { - orte_show_help("help-orterun.txt", "orterun:proc-called-abort", true, - orterun_basename, - (0 == strncmp("orte", orterun_basename, 4)) ? "orte" : "MPI"); - } else if (ORTE_JOB_STATE_HEARTBEAT_FAILED == job->state) { - orte_show_help("help-orterun.txt", "orterun:proc-heartbeat-failed", true, - orterun_basename, ORTE_NAME_PRINT(&proc->name), node->name); - } - return; - } - } -} - -static void abort_exit_callback(int fd, short ign, void *arg) -{ - int ret; - - fprintf(stderr, "%s: killing job...\n\n", orterun_basename); - - /* since we are being terminated by a user's signal, be - * sure to exit with a non-zero exit code - but don't - * overwrite any error code from a proc that might have - * failed, in case that is why the user ordered us - * to terminate - */ - ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - - /* terminate the job - this will also wakeup orterun so - * it can report to the user and kill all the orteds. - * Check the jobid, though, just in case the user - * hit ctrl-c before we had a chance to setup the - * job in the system - in which case there is nothing - * to terminate! - */ - if (NULL != jdata && - jdata->jobid != ORTE_JOBID_INVALID && - !orte_never_launched) { - /* if the debuggers were run, clean up */ - orte_debugger.finalize(); - - /* - * Turn off the process recovery functionality, if it was enabled. - * This keeps the errmgr from trying to recover from the shutdown - * procedure. - */ - orte_enable_recovery = false; - - /* terminate the orteds - they will automatically kill - * their local procs - */ - ret = orte_plm.terminate_orteds(); - if (ORTE_SUCCESS != ret) { - /* If we failed the terminate_orteds() above, then we - * need to just die - */ - just_quit(fd, ign, arg); - } - /* give ourselves a time limit on how long to wait - * for the job to die, just in case we can't make it go - * away for some reason. 
Don't send us directly back - * to job_completed, though, as that function may be - * what has failed - */ - ORTE_DETECT_TIMEOUT(&abort_exit_event, jdata->num_procs, - orte_timeout_usec_per_proc, - orte_max_timeout, - just_quit); - - } else { - /* if the jobid is invalid or we never launched, - * there is nothing to do but just clean ourselves - * up and exit - */ - just_quit(fd, ign, arg); - } -} - -/* - * Attempt to terminate the job and wait for callback indicating - * the job has been aborted. - */ -static void abort_signal_callback(int fd, short flags, void *arg) -{ - /* if we have already ordered this once, don't keep - * doing it to avoid race conditions - */ - if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */ - if (forcibly_die) { - /* kill any local procs */ - orte_odls.kill_local_procs(NULL); - - /* whack any lingering session directory files from our jobs */ - orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); - - /* cleanup our data server */ - orte_data_server_finalize(); - - /* exit with a non-zero status */ - exit(ORTE_ERROR_DEFAULT_EXIT_CODE); - } - fprintf(stderr, "%s: abort is already in progress...hit ctrl-c again to forcibly terminate\n\n", orterun_basename); - forcibly_die = true; - return; - } - - /* set the global abnormal exit flag so we know not to - * use the standard xcast for terminating orteds - */ - orte_abnormal_term_ordered = true; - /* ensure that the forwarding of stdin stops */ - orte_job_term_ordered = true; - - /* tell us to be quiet - hey, the user killed us with a ctrl-c, - * so need to tell them that! - */ - orterun_globals.quiet = true; - - /* We are in an event handler; the job completed procedure - will delete the signal handler that is currently running - (which is a Bad Thing), so we can't call it directly. - Instead, we have to exit this handler and setup to call - job_completed() after this. 
*/ - ORTE_TIMER_EVENT(0, 0, abort_exit_callback); -} - -/** - * Deal with sigpipe errors - */ -static void epipe_signal_callback(int fd, short flags, void *arg) -{ - /* for now, we just announce and ignore them */ - OPAL_OUTPUT_VERBOSE((1, orte_debug_verbosity, - "%s reports a SIGPIPE error on fd %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), fd)); - return; -} - -/** - * Pass user signals to the remote application processes - */ -static void signal_forward_callback(int fd, short event, void *arg) -{ - struct opal_event *signal = (struct opal_event*)arg; - int signum, ret; - - signum = OPAL_EVENT_SIGNAL(signal); - if (!orterun_globals.quiet){ - fprintf(stderr, "%s: Forwarding signal %d to job\n", - orterun_basename, signum); - } - - /** send the signal out to the processes, including any descendants */ - if (ORTE_SUCCESS != (ret = orte_plm.signal_job(jdata->jobid, signum))) { - fprintf(stderr, "Signal %d could not be sent to the job (returned %d)", - signum, ret); - } -} - - static int init_globals(void) { /* Only CONSTRUCT things once */ @@ -1389,7 +784,6 @@ static int init_globals(void) orterun_globals.help = false; orterun_globals.version = false; orterun_globals.verbose = false; - orterun_globals.quiet = false; orterun_globals.by_node = false; orterun_globals.by_slot = false; orterun_globals.by_board = false; @@ -1429,13 +823,13 @@ static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line) if (orterun_globals.version && !(1 == argc || orterun_globals.help)) { char *project_name = NULL; - if (0 == strcmp(orterun_basename, "mpirun")) { + if (0 == strcmp(orte_basename, "mpirun")) { project_name = "Open MPI"; } else { project_name = "OpenRTE"; } orte_show_help("help-orterun.txt", "orterun:version", false, - orterun_basename, project_name, OPAL_VERSION, + orte_basename, project_name, OPAL_VERSION, PACKAGE_BUGREPORT); /* if we were the only argument, exit */ if (2 == argc) exit(0); @@ -1445,15 +839,15 @@ static int parse_globals(int argc, char* argv[], 
opal_cmd_line_t *cmd_line) if (1 == argc || orterun_globals.help) { char *args = NULL; char *project_name = NULL; - if (0 == strcmp(orterun_basename, "mpirun")) { + if (0 == strcmp(orte_basename, "mpirun")) { project_name = "Open MPI"; } else { project_name = "OpenRTE"; } args = opal_cmd_line_get_usage_msg(cmd_line); orte_show_help("help-orterun.txt", "orterun:usage", false, - orterun_basename, project_name, OPAL_VERSION, - orterun_basename, args, + orte_basename, project_name, OPAL_VERSION, + orte_basename, args, PACKAGE_BUGREPORT); free(args); @@ -1474,7 +868,7 @@ static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line) fp = fopen(orterun_globals.report_pid, "w"); if (NULL == fp) { orte_show_help("help-orterun.txt", "orterun:write_file", false, - orterun_basename, "pid", orterun_globals.report_pid); + orte_basename, "pid", orterun_globals.report_pid); exit(0); } fprintf(fp, "%d\n", (int)getpid()); @@ -1485,7 +879,7 @@ static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line) /* Do we want a user-level debugger? */ if (orterun_globals.debugger) { - run_debugger(orterun_basename, cmd_line, argc, argv, orterun_globals.num_procs); + run_debugger(orte_basename, cmd_line, argc, argv, orterun_globals.num_procs); } /* extract any rank assignment policy directives */ @@ -1555,7 +949,7 @@ static int parse_locals(int argc, char* argv[]) if (NULL == filename) { /* filename is not correctly formatted */ orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-bad", true, - orterun_basename, orterun_globals.ompi_server); + orte_basename, orterun_globals.ompi_server); exit(1); } ++filename; /* space past the : */ @@ -1563,7 +957,7 @@ static int parse_locals(int argc, char* argv[]) if (0 >= strlen(filename)) { /* they forgot to give us the name! 
*/ orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-missing", true, - orterun_basename, orterun_globals.ompi_server); + orte_basename, orterun_globals.ompi_server); exit(1); } @@ -1571,15 +965,15 @@ static int parse_locals(int argc, char* argv[]) fp = fopen(filename, "r"); if (NULL == fp) { /* can't find or read file! */ orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-access", true, - orterun_basename, orterun_globals.ompi_server); + orte_basename, orterun_globals.ompi_server); exit(1); } if (NULL == fgets(input, 1024, fp)) { /* something malformed about file */ fclose(fp); orte_show_help("help-orterun.txt", "orterun:ompi-server-file-bad", true, - orterun_basename, orterun_globals.ompi_server, - orterun_basename); + orte_basename, orterun_globals.ompi_server, + orte_basename); exit(1); } fclose(fp); @@ -1597,8 +991,8 @@ static int parse_locals(int argc, char* argv[]) if (NULL == ptr) { /* pid is not correctly formatted */ orte_show_help("help-orterun.txt", "orterun:ompi-server-pid-bad", true, - orterun_basename, orterun_basename, - orterun_globals.ompi_server, orterun_basename); + orte_basename, orte_basename, + orterun_globals.ompi_server, orte_basename); exit(1); } ++ptr; /* space past the : */ @@ -1606,8 +1000,8 @@ static int parse_locals(int argc, char* argv[]) if (0 >= strlen(ptr)) { /* they forgot to give us the pid! 
*/ orte_show_help("help-orterun.txt", "orterun:ompi-server-pid-bad", true, - orterun_basename, orterun_basename, - orterun_globals.ompi_server, orterun_basename); + orte_basename, orte_basename, + orterun_globals.ompi_server, orte_basename); exit(1); } @@ -1621,7 +1015,7 @@ static int parse_locals(int argc, char* argv[]) &orte_process_info.top_session_dir, NULL, NULL, NULL))) { orte_show_help("help-orterun.txt", "orterun:ompi-server-could-not-get-hnp-list", true, - orterun_basename, orterun_basename); + orte_basename, orte_basename); exit(1); } @@ -1630,7 +1024,7 @@ static int parse_locals(int argc, char* argv[]) /* get the list of HNPs, but do -not- setup contact info to them in the RML */ if (ORTE_SUCCESS != (rc = orte_list_local_hnps(&hnp_list, false))) { orte_show_help("help-orterun.txt", "orterun:ompi-server-could-not-get-hnp-list", true, - orterun_basename, orterun_basename); + orte_basename, orte_basename); exit(1); } @@ -1645,8 +1039,8 @@ static int parse_locals(int argc, char* argv[]) } /* if we got here, it wasn't found */ orte_show_help("help-orterun.txt", "orterun:ompi-server-pid-not-found", true, - orterun_basename, orterun_basename, pid, orterun_globals.ompi_server, - orterun_basename); + orte_basename, orte_basename, pid, orterun_globals.ompi_server, + orte_basename); OBJ_DESTRUCT(&hnp_list); exit(1); hnp_found: @@ -1843,7 +1237,7 @@ static int capture_cmd_line_params(int argc, int start, char **argv) * and abort as we cannot know which one is correct */ orte_show_help("help-orterun.txt", "orterun:conflicting-params", - true, orterun_basename, argv[i+1], + true, orte_basename, argv[i+1], argv[i+2], orted_cmd_line[j+1]); return ORTE_ERR_BAD_PARAM; } @@ -1945,7 +1339,7 @@ static int create_app(int argc, char* argv[], orte_app_context_t **app_ptr, if (0 == count) { orte_show_help("help-orterun.txt", "orterun:executable-not-specified", - true, orterun_basename, orterun_basename); + true, orte_basename, orte_basename); rc = ORTE_ERR_NOT_FOUND; goto 
cleanup; } @@ -2102,7 +1496,7 @@ static int create_app(int argc, char* argv[], orte_app_context_t **app_ptr, param_len--; if (0 == param_len) { orte_show_help("help-orterun.txt", "orterun:empty-prefix", - true, orterun_basename, orterun_basename); + true, orte_basename, orte_basename); return ORTE_ERR_FATAL; } } @@ -2118,7 +1512,7 @@ static int create_app(int argc, char* argv[], orte_app_context_t **app_ptr, if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "hostfile"))) { if(1 < j) { orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles", - true, orterun_basename, NULL); + true, orte_basename, NULL); return ORTE_ERR_FATAL; } else { value = opal_cmd_line_get_param(&cmd_line, "hostfile", 0, 0); @@ -2128,7 +1522,7 @@ static int create_app(int argc, char* argv[], orte_app_context_t **app_ptr, if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "machinefile"))) { if(1 < j || NULL != app->hostfile) { orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles", - true, orterun_basename, NULL); + true, orte_basename, NULL); return ORTE_ERR_FATAL; } else { value = opal_cmd_line_get_param(&cmd_line, "machinefile", 0, 0); @@ -2169,7 +1563,7 @@ static int create_app(int argc, char* argv[], orte_app_context_t **app_ptr, * then give us another application. 
*/ orte_show_help("help-orterun.txt", "orterun:multi-apps-and-zero-np", - true, orterun_basename, NULL); + true, orte_basename, NULL); return ORTE_ERR_FATAL; } @@ -2195,7 +1589,7 @@ static int create_app(int argc, char* argv[], orte_app_context_t **app_ptr, app->app = strdup(app->argv[0]); if (NULL == app->app) { orte_show_help("help-orterun.txt", "orterun:call-failed", - true, orterun_basename, "library", "strdup returned NULL", errno); + true, orte_basename, "library", "strdup returned NULL", errno); rc = ORTE_ERR_NOT_FOUND; goto cleanup; } diff --git a/orte/tools/orterun/orterun.h b/orte/tools/orterun/orterun.h index a399e2b4f3..1ab5112caa 100644 --- a/orte/tools/orterun/orterun.h +++ b/orte/tools/orterun/orterun.h @@ -37,7 +37,6 @@ struct orterun_globals_t { bool help; bool version; bool verbose; - bool quiet; char *report_pid; char *report_uri; bool exit;