Start reducing our dependency on the event library by removing at least one instance where we use it to redirect the program counter. Rolf reported occasional hangs of mpirun in very specific circumstances after all daemons were done. A review of MTT results indicates this may have been happening more generally in a small fraction of cases.
The problem was tracked to the use of grpcomm.onesided_barrier to control daemon/mpirun termination. This relied on messaging -and- required that the program counter jump from the errmgr back to grpcomm. On rare occasions, this jump did not occur, causing mpirun to hang.

This patch looks more invasive than it is - most of the affected files simply had one or two lines removed. The essence of the change is:

* pulled the job_complete and quit routines out of orterun and orted_main and put them in a common place
* modified the errmgr to directly call the new routines when termination is detected
* removed the grpcomm.onesided_barrier and its associated RML tag
* added a new "num_routes" API to the routed framework that reports back the number of dependent routes. When route_lost is called, the daemon's list of "children" is checked and adjusted if that route went to a "leaf" in the routing tree
* used connection termination between daemons to track rollup of the daemon tree. Daemons and HNP now terminate once num_routes returns zero

Also picked up in this commit is the addition of a new bool flag to the app_context struct, and increasing the job_control field from 8 to 16 bits. Both trivial.

This commit was SVN r23429.
Этот коммит содержится в:
родитель
acd990ffe5
Коммит
12cd07c9a9
@ -27,12 +27,6 @@
|
||||
#include "opal/util/opal_sos.h"
|
||||
#include "opal/dss/dss.h"
|
||||
|
||||
#include "orte/util/error_strings.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_locks.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/odls/odls.h"
|
||||
#include "orte/mca/odls/base/base.h"
|
||||
@ -43,6 +37,15 @@
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/mca/debugger/base/base.h"
|
||||
|
||||
#include "orte/util/error_strings.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/show_help.h"
|
||||
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_locks.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
@ -296,8 +299,15 @@ static int update_state(orte_jobid_t job,
|
||||
|
||||
/* get the job object */
|
||||
if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
/* if the orteds are terminating, check job complete */
|
||||
if (orte_orteds_term_ordered) {
|
||||
opal_output(0, "TERM ORDERED - CHECKING COMPLETE");
|
||||
check_job_complete(NULL);
|
||||
return ORTE_SUCCESS;
|
||||
} else {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
}
|
||||
|
||||
/* update is for a specific proc */
|
||||
@ -390,38 +400,48 @@ static int update_state(orte_jobid_t job,
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_COMM_FAILED:
|
||||
/* delete the route */
|
||||
orte_routed.delete_route(proc);
|
||||
/* purge the oob */
|
||||
orte_rml.purge(proc);
|
||||
/* is this to a daemon? */
|
||||
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
|
||||
/* if we have ordered orteds to terminate, see if this one failed to tell
|
||||
* us it had terminated
|
||||
*/
|
||||
/* if this is my own connection, ignore it */
|
||||
if (ORTE_PROC_MY_NAME->vpid == proc->vpid) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s My own connection - ignoring it",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
break;
|
||||
}
|
||||
/* if we have ordered orteds to terminate, record it */
|
||||
if (orte_orteds_term_ordered) {
|
||||
if (orte_orted_exit_with_barrier) {
|
||||
record_dead_daemon(jdata, proc->vpid, state, exit_code);
|
||||
check_job_complete(jdata);
|
||||
break;
|
||||
} else {
|
||||
record_dead_daemon(jdata, proc->vpid, state, 0);
|
||||
check_job_complete(jdata);
|
||||
break;
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s Daemons terminating - recording daemon %s as gone",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
|
||||
/* remove from dependent routes, if it is one */
|
||||
orte_routed.route_lost(proc);
|
||||
/* update daemon job */
|
||||
record_dead_daemon(jdata, proc->vpid, state, 0);
|
||||
/* check for complete */
|
||||
check_job_complete(jdata);
|
||||
break;
|
||||
}
|
||||
/* if abort is in progress, see if this one failed to tell
|
||||
* us it had terminated
|
||||
*/
|
||||
if (orte_abnormal_term_ordered) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s Abort in progress - recording daemon %s as gone",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
|
||||
/* remove from dependent routes, if it is one */
|
||||
orte_routed.route_lost(proc);
|
||||
/* update daemon job */
|
||||
record_dead_daemon(jdata, proc->vpid, state, exit_code);
|
||||
/* check for complete */
|
||||
check_job_complete(jdata);
|
||||
break;
|
||||
}
|
||||
/* if this is my own connection, ignore it */
|
||||
if (ORTE_PROC_MY_NAME->vpid == proc->vpid) {
|
||||
break;
|
||||
}
|
||||
/* delete the route */
|
||||
orte_routed.delete_route(proc);
|
||||
/* purge the oob */
|
||||
orte_rml.purge(proc);
|
||||
|
||||
if (orte_enable_recovery) {
|
||||
/* relocate its processes */
|
||||
if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc, state, exit_code))) {
|
||||
@ -755,7 +775,14 @@ static void check_job_complete(orte_job_t *jdata)
|
||||
/* Check if FileM is active. If so then keep processing. */
|
||||
OPAL_ACQUIRE_THREAD(&orte_filem_base_lock, &orte_filem_base_cond, &orte_filem_base_is_active);
|
||||
#endif
|
||||
|
||||
if (NULL == jdata) {
|
||||
/* just check to see if the daemons are complete */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr:hnp:check_job_complete - received NULL job, checking daemons",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
goto CHECK_DAEMONS;
|
||||
}
|
||||
|
||||
for (i=0; i < jdata->procs->size && !jdata->abort; i++) {
|
||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
|
||||
/* the proc array may no longer be left justified, so
|
||||
@ -978,14 +1005,21 @@ static void check_job_complete(orte_job_t *jdata)
|
||||
* This can happen if a ctrl-c hits in the "wrong" place
|
||||
* while launching
|
||||
*/
|
||||
CHECK_DAEMONS:
|
||||
if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
|
||||
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
||||
if (jdata->num_terminated >= jdata->num_procs) {
|
||||
if (0 == orte_routed.num_routes()) {
|
||||
/* orteds are done! */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s orteds complete - exiting",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
if (NULL == jdata) {
|
||||
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
||||
}
|
||||
jdata->state = ORTE_JOB_STATE_TERMINATED;
|
||||
orte_trigger_event(&orteds_exit);
|
||||
orte_quit();
|
||||
return;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/* Release the resources used by this job. Since some errmgrs may want
|
||||
@ -1094,15 +1128,22 @@ static void check_job_complete(orte_job_t *jdata)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
return;
|
||||
}
|
||||
/* if we get here, then all jobs are done, so wakeup */
|
||||
/* if we get here, then all jobs are done, so terminate */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr:hnp:check_job_completed all jobs terminated - waking up",
|
||||
"%s errmgr:hnp:check_job_completed all jobs terminated",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
/* set the exit status to 0 - this will only happen if it
|
||||
* wasn't already set by an error condition
|
||||
*/
|
||||
ORTE_UPDATE_EXIT_STATUS(0);
|
||||
orte_trigger_event(&orte_exit);
|
||||
orte_jobs_complete();
|
||||
/* if I am the only daemon alive, then I can exit now */
|
||||
if (0 == orte_routed.num_routes()) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s orteds complete - exiting",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
orte_quit();
|
||||
}
|
||||
}
|
||||
|
||||
static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
|
||||
|
@ -35,6 +35,7 @@
|
||||
#include "orte/mca/plm/plm_types.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
@ -244,16 +245,23 @@ static int update_state(orte_jobid_t job,
|
||||
ORTE_PROC_MY_NAME->vpid == proc->vpid) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
/* delete the route */
|
||||
orte_routed.delete_route(proc);
|
||||
/* purge the oob */
|
||||
orte_rml.purge(proc);
|
||||
/* see if this was a lifeline */
|
||||
if (ORTE_SUCCESS != orte_routed.route_lost(proc)) {
|
||||
/* kill our children */
|
||||
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
|
||||
/* tell the caller we can't recover */
|
||||
return ORTE_ERR_UNRECOVERABLE;
|
||||
/* terminate - our routed children will see
|
||||
* us leave and automatically die
|
||||
*/
|
||||
orte_quit();
|
||||
}
|
||||
/* purge the oob */
|
||||
orte_rml.purge(proc);
|
||||
/* was it a daemon that failed? */
|
||||
if (proc->jobid == ORTE_PROC_MY_NAME->jobid) {
|
||||
/* if all my routes are gone, then terminate ourselves */
|
||||
if (0 == orte_routed.num_routes()) {
|
||||
orte_quit();
|
||||
}
|
||||
}
|
||||
/* if not, then indicate we can continue */
|
||||
return ORTE_SUCCESS;
|
||||
@ -272,10 +280,17 @@ static int update_state(orte_jobid_t job,
|
||||
}
|
||||
}
|
||||
if (NULL == jobdat) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
/* must already be complete */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* if there are no local procs for this job, we can
|
||||
* ignore this call
|
||||
*/
|
||||
if (0 == jobdat->num_local_procs) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr:orted got state %s for proc %s pid %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
|
@ -37,6 +37,7 @@
|
||||
#include "opal/mca/pstat/base/base.h"
|
||||
#include "opal/mca/paffinity/base/base.h"
|
||||
#include "opal/mca/sysinfo/base/base.h"
|
||||
#include "opal/util/os_path.h"
|
||||
|
||||
#include "orte/mca/rml/base/base.h"
|
||||
#include "orte/mca/routed/base/base.h"
|
||||
@ -66,18 +67,61 @@
|
||||
#include "orte/runtime/orte_cr.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
|
||||
#include "orte/mca/ess/base/base.h"
|
||||
|
||||
static bool plm_in_use;
|
||||
/* local globals */
|
||||
static bool plm_in_use=false;
|
||||
static bool signals_set=false;
|
||||
static struct opal_event term_handler;
|
||||
static struct opal_event int_handler;
|
||||
static struct opal_event epipe_handler;
|
||||
#ifndef __WINDOWS__
|
||||
static struct opal_event sigusr1_handler;
|
||||
static struct opal_event sigusr2_handler;
|
||||
#endif /* __WINDOWS__ */
|
||||
char *log_path = NULL;
|
||||
static void shutdown_signal(int fd, short flags, void *arg);
|
||||
static void signal_callback(int fd, short flags, void *arg);
|
||||
static void epipe_signal_callback(int fd, short flags, void *arg);
|
||||
|
||||
int orte_ess_base_orted_setup(char **hosts)
|
||||
{
|
||||
int ret;
|
||||
int fd;
|
||||
char log_file[PATH_MAX];
|
||||
char *jobidstring;
|
||||
char *error = NULL;
|
||||
char *plm_to_use;
|
||||
int value;
|
||||
|
||||
#ifndef __WINDOWS__
|
||||
/* setup callback for SIGPIPE */
|
||||
opal_signal_set(&epipe_handler, SIGPIPE,
|
||||
epipe_signal_callback, &epipe_handler);
|
||||
opal_signal_add(&epipe_handler, NULL);
|
||||
/* Set signal handlers to catch kill signals so we can properly clean up
|
||||
* after ourselves.
|
||||
*/
|
||||
opal_event_set(&term_handler, SIGTERM, OPAL_EV_SIGNAL,
|
||||
shutdown_signal, NULL);
|
||||
opal_event_add(&term_handler, NULL);
|
||||
opal_event_set(&int_handler, SIGINT, OPAL_EV_SIGNAL,
|
||||
shutdown_signal, NULL);
|
||||
opal_event_add(&int_handler, NULL);
|
||||
|
||||
/** setup callbacks for signals we should ignore */
|
||||
opal_signal_set(&sigusr1_handler, SIGUSR1,
|
||||
signal_callback, &sigusr1_handler);
|
||||
opal_signal_add(&sigusr1_handler, NULL);
|
||||
opal_signal_set(&sigusr2_handler, SIGUSR2,
|
||||
signal_callback, &sigusr2_handler);
|
||||
opal_signal_add(&sigusr2_handler, NULL);
|
||||
#endif /* __WINDOWS__ */
|
||||
|
||||
signals_set = true;
|
||||
|
||||
/* initialize the global list of local children and job data */
|
||||
OBJ_CONSTRUCT(&orte_local_children, opal_list_t);
|
||||
OBJ_CONSTRUCT(&orte_local_jobdata, opal_list_t);
|
||||
@ -321,10 +365,48 @@ int orte_ess_base_orted_setup(char **hosts)
|
||||
goto error;
|
||||
}
|
||||
/* Once the session directory location has been established, set
|
||||
the opal_output env file location to be in the
|
||||
proc-specific session directory. */
|
||||
the opal_output env file location to be in the
|
||||
proc-specific session directory. */
|
||||
opal_output_set_output_file_info(orte_process_info.proc_session_dir,
|
||||
"output-", NULL, NULL);
|
||||
|
||||
/* setup stdout/stderr */
|
||||
if (orte_debug_daemons_file_flag) {
|
||||
/* if we are debugging to a file, then send stdout/stderr to
|
||||
* the orted log file
|
||||
*/
|
||||
|
||||
/* get my jobid */
|
||||
if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobidstring,
|
||||
ORTE_PROC_MY_NAME->jobid))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "convert_jobid";
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* define a log file name in the session directory */
|
||||
snprintf(log_file, PATH_MAX, "output-orted-%s-%s.log",
|
||||
jobidstring, orte_process_info.nodename);
|
||||
log_path = opal_os_path(false,
|
||||
orte_process_info.tmpdir_base,
|
||||
orte_process_info.top_session_dir,
|
||||
log_file,
|
||||
NULL);
|
||||
|
||||
fd = open(log_path, O_RDWR|O_CREAT|O_TRUNC, 0640);
|
||||
if (fd < 0) {
|
||||
/* couldn't open the file for some reason, so
|
||||
* just connect everything to /dev/null
|
||||
*/
|
||||
fd = open("/dev/null", O_RDWR|O_CREAT|O_TRUNC, 0666);
|
||||
} else {
|
||||
dup2(fd, STDOUT_FILENO);
|
||||
dup2(fd, STDERR_FILENO);
|
||||
if(fd != STDOUT_FILENO && fd != STDERR_FILENO) {
|
||||
close(fd);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* setup the routed info - the selected routed component
|
||||
@ -434,7 +516,7 @@ int orte_ess_base_orted_setup(char **hosts)
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
||||
error:
|
||||
error:
|
||||
orte_show_help("help-orte-runtime.txt",
|
||||
"orte_init:startup:internal-failure",
|
||||
true, error, ORTE_ERROR_NAME(ret), ret);
|
||||
@ -447,14 +529,27 @@ int orte_ess_base_orted_finalize(void)
|
||||
/* stop the local sensors */
|
||||
orte_sensor.stop(ORTE_PROC_MY_NAME->jobid);
|
||||
|
||||
/* ensure all the orteds depart together */
|
||||
if (!orte_abnormal_term_ordered) {
|
||||
/* if we are abnormally terminating, don't attempt
|
||||
* to do a barrier as nobody else will be entering
|
||||
* that call
|
||||
*/
|
||||
orte_grpcomm.onesided_barrier();
|
||||
if (signals_set) {
|
||||
/* Release all local signal handlers */
|
||||
opal_event_del(&epipe_handler);
|
||||
opal_event_del(&term_handler);
|
||||
opal_event_del(&int_handler);
|
||||
#ifndef __WINDOWS__
|
||||
opal_signal_del(&sigusr1_handler);
|
||||
opal_signal_del(&sigusr2_handler);
|
||||
#endif /* __WINDOWS__ */
|
||||
}
|
||||
|
||||
/* cleanup */
|
||||
if (NULL != log_path) {
|
||||
unlink(log_path);
|
||||
}
|
||||
|
||||
/* make sure our local procs are dead */
|
||||
orte_odls.kill_local_procs(NULL);
|
||||
|
||||
/* whack any lingering session directory files from our jobs */
|
||||
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
||||
|
||||
orte_sensor_base_close();
|
||||
orte_db_base_close();
|
||||
@ -493,3 +588,29 @@ int orte_ess_base_orted_finalize(void)
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static void shutdown_signal(int fd, short flags, void *arg)
|
||||
{
|
||||
/* trigger the call to shutdown callback to protect
|
||||
* against race conditions - the trigger event will
|
||||
* check the one-time lock
|
||||
*/
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
orte_quit();
|
||||
}
|
||||
|
||||
/**
|
||||
* Deal with sigpipe errors
|
||||
*/
|
||||
static void epipe_signal_callback(int fd, short flags, void *arg)
|
||||
{
|
||||
/* for now, we just announce and ignore them */
|
||||
opal_output(0, "%s reports a SIGPIPE error on fd %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), fd);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
 * Event callback installed for SIGUSR1 and SIGUSR2.
 *
 * Intentionally does nothing: having a handler registered is enough
 * for these signals to be swallowed.
 */
static void signal_callback(int fd, short event, void *arg)
{
    (void)fd;
    (void)event;
    (void)arg;
}
|
||||
|
@ -43,7 +43,6 @@
|
||||
#include "opal/mca/paffinity/base/base.h"
|
||||
#include "opal/mca/sysinfo/base/base.h"
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/mca/rml/base/base.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/mca/routed/base/base.h"
|
||||
@ -53,6 +52,7 @@
|
||||
#include "orte/mca/iof/base/base.h"
|
||||
#include "orte/mca/ras/base/base.h"
|
||||
#include "orte/mca/plm/base/base.h"
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/odls/base/base.h"
|
||||
#include "orte/mca/notifier/base/base.h"
|
||||
#include "orte/mca/rmcast/base/base.h"
|
||||
@ -60,12 +60,14 @@
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
#include "orte/mca/debugger/base/base.h"
|
||||
|
||||
#include "orte/mca/debugger/debugger.h"
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
#include "orte/mca/snapc/base/base.h"
|
||||
#endif
|
||||
#include "orte/mca/filem/base/base.h"
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/util/hnp_contact.h"
|
||||
@ -76,8 +78,11 @@
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
#include "orte/runtime/orte_cr.h"
|
||||
#include "orte/runtime/orte_locks.h"
|
||||
#include "orte/runtime/orte_data_server.h"
|
||||
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/mca/ess/base/base.h"
|
||||
#include "orte/mca/ess/hnp/ess_hnp.h"
|
||||
@ -108,6 +113,23 @@ orte_ess_base_module_t orte_ess_hnp_module = {
|
||||
NULL /* ft_event */
|
||||
};
|
||||
|
||||
/* local globals */
|
||||
static bool signals_set=false;
|
||||
static struct opal_event term_handler;
|
||||
static struct opal_event int_handler;
|
||||
static struct opal_event epipe_handler;
|
||||
#ifndef __WINDOWS__
|
||||
static struct opal_event sigusr1_handler;
|
||||
static struct opal_event sigusr2_handler;
|
||||
static struct opal_event sigtstp_handler;
|
||||
static struct opal_event sigcont_handler;
|
||||
#endif /* __WINDOWS__ */
|
||||
|
||||
static void abort_signal_callback(int fd, short flags, void *arg);
|
||||
static void abort_exit_callback(int fd, short event, void *arg);
|
||||
static void epipe_signal_callback(int fd, short flags, void *arg);
|
||||
static void signal_forward_callback(int fd, short event, void *arg);
|
||||
|
||||
static int rte_init(void)
|
||||
{
|
||||
int ret;
|
||||
@ -124,6 +146,41 @@ static int rte_init(void)
|
||||
goto error;
|
||||
}
|
||||
|
||||
#ifndef __WINDOWS__
|
||||
/* setup callback for SIGPIPE */
|
||||
opal_signal_set(&epipe_handler, SIGPIPE,
|
||||
epipe_signal_callback, &epipe_handler);
|
||||
opal_signal_add(&epipe_handler, NULL);
|
||||
/** setup callbacks for abort signals - from this point
|
||||
* forward, we need to abort in a manner that allows us
|
||||
* to cleanup
|
||||
*/
|
||||
opal_signal_set(&term_handler, SIGTERM,
|
||||
abort_signal_callback, &term_handler);
|
||||
opal_signal_add(&term_handler, NULL);
|
||||
opal_signal_set(&int_handler, SIGINT,
|
||||
abort_signal_callback, &int_handler);
|
||||
opal_signal_add(&int_handler, NULL);
|
||||
|
||||
/** setup callbacks for signals we should foward */
|
||||
opal_signal_set(&sigusr1_handler, SIGUSR1,
|
||||
signal_forward_callback, &sigusr1_handler);
|
||||
opal_signal_add(&sigusr1_handler, NULL);
|
||||
opal_signal_set(&sigusr2_handler, SIGUSR2,
|
||||
signal_forward_callback, &sigusr2_handler);
|
||||
opal_signal_add(&sigusr2_handler, NULL);
|
||||
if (orte_forward_job_control) {
|
||||
opal_signal_set(&sigtstp_handler, SIGTSTP,
|
||||
signal_forward_callback, &sigtstp_handler);
|
||||
opal_signal_add(&sigtstp_handler, NULL);
|
||||
opal_signal_set(&sigcont_handler, SIGCONT,
|
||||
signal_forward_callback, &sigcont_handler);
|
||||
opal_signal_add(&sigcont_handler, NULL);
|
||||
}
|
||||
#endif /* __WINDOWS__ */
|
||||
|
||||
signals_set = true;
|
||||
|
||||
/* determine the topology info */
|
||||
if (0 == orte_default_num_sockets_per_board) {
|
||||
/* we weren't given a number, so try to determine it */
|
||||
@ -615,6 +672,24 @@ static int rte_finalize(void)
|
||||
orte_job_t *job;
|
||||
int i;
|
||||
|
||||
if (signals_set) {
|
||||
/* Remove the epipe handler */
|
||||
opal_signal_del(&epipe_handler);
|
||||
/* Remove the TERM and INT signal handlers */
|
||||
opal_signal_del(&term_handler);
|
||||
opal_signal_del(&int_handler);
|
||||
#ifndef __WINDOWS__
|
||||
/** Remove the USR signal handlers */
|
||||
opal_signal_del(&sigusr1_handler);
|
||||
opal_signal_del(&sigusr2_handler);
|
||||
if (orte_forward_job_control) {
|
||||
opal_signal_del(&sigtstp_handler);
|
||||
opal_signal_del(&sigcont_handler);
|
||||
}
|
||||
#endif /* __WINDOWS__ */
|
||||
signals_set = false;
|
||||
}
|
||||
|
||||
/* stop the debuggers */
|
||||
orte_debugger_base_close();
|
||||
|
||||
@ -879,3 +954,132 @@ static int update_nidmap(opal_byte_object_t *bo)
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static bool forcibly_die=false;
|
||||
|
||||
static void abort_exit_callback(int fd, short ign, void *arg)
|
||||
{
|
||||
int ret;
|
||||
|
||||
fprintf(stderr, "%s: killing job...\n\n", orte_basename);
|
||||
|
||||
/* since we are being terminated by a user's signal, be
|
||||
* sure to exit with a non-zero exit code - but don't
|
||||
* overwrite any error code from a proc that might have
|
||||
* failed, in case that is why the user ordered us
|
||||
* to terminate
|
||||
*/
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
|
||||
/* terminate the job - this will also wakeup orterun so
|
||||
* it can report to the user and kill all the orteds.
|
||||
* Check the jobid, though, just in case the user
|
||||
* hit ctrl-c before we had a chance to setup the
|
||||
* job in the system - in which case there is nothing
|
||||
* to terminate!
|
||||
*/
|
||||
if (!orte_never_launched) {
|
||||
/* if the debuggers were run, clean up */
|
||||
orte_debugger.finalize();
|
||||
|
||||
/*
|
||||
* Turn off the process recovery functionality, if it was enabled.
|
||||
* This keeps the errmgr from trying to recover from the shutdown
|
||||
* procedure.
|
||||
*/
|
||||
orte_enable_recovery = false;
|
||||
|
||||
/* terminate the orteds - they will automatically kill
|
||||
* their local procs
|
||||
*/
|
||||
ret = orte_plm.terminate_orteds();
|
||||
|
||||
} else {
|
||||
/* if the jobid is invalid or we never launched,
|
||||
* there is nothing to do but just clean ourselves
|
||||
* up and exit
|
||||
*/
|
||||
orte_quit();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Attempt to terminate the job and wait for callback indicating
|
||||
* the job has been aborted.
|
||||
*/
|
||||
static void abort_signal_callback(int fd, short flags, void *arg)
|
||||
{
|
||||
/* if we have already ordered this once, don't keep
|
||||
* doing it to avoid race conditions
|
||||
*/
|
||||
if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */
|
||||
if (forcibly_die) {
|
||||
/* kill any local procs */
|
||||
orte_odls.kill_local_procs(NULL);
|
||||
|
||||
/* whack any lingering session directory files from our jobs */
|
||||
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
||||
|
||||
/* cleanup our data server */
|
||||
orte_data_server_finalize();
|
||||
|
||||
/* exit with a non-zero status */
|
||||
exit(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
}
|
||||
fprintf(stderr, "%s: abort is already in progress...hit ctrl-c again to forcibly terminate\n\n", orte_basename);
|
||||
forcibly_die = true;
|
||||
return;
|
||||
}
|
||||
|
||||
/* set the global abnormal exit flag so we know not to
|
||||
* use the standard xcast for terminating orteds
|
||||
*/
|
||||
orte_abnormal_term_ordered = true;
|
||||
/* ensure that the forwarding of stdin stops */
|
||||
orte_job_term_ordered = true;
|
||||
|
||||
/* tell us to be quiet - hey, the user killed us with a ctrl-c,
|
||||
* so need to tell them that!
|
||||
*/
|
||||
orte_execute_quiet = true;
|
||||
|
||||
/* We are in an event handler; the job completed procedure
|
||||
will delete the signal handler that is currently running
|
||||
(which is a Bad Thing), so we can't call it directly.
|
||||
Instead, we have to exit this handler and setup to call
|
||||
job_completed() after this. */
|
||||
ORTE_TIMER_EVENT(0, 0, abort_exit_callback);
|
||||
}
|
||||
|
||||
/**
|
||||
* Deal with sigpipe errors
|
||||
*/
|
||||
static void epipe_signal_callback(int fd, short flags, void *arg)
|
||||
{
|
||||
/* for now, we just announce and ignore them */
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_debug_verbosity,
|
||||
"%s reports a SIGPIPE error on fd %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), fd));
|
||||
return;
|
||||
}
|
||||
|
||||
/**
|
||||
* Pass user signals to the remote application processes
|
||||
*/
|
||||
static void signal_forward_callback(int fd, short event, void *arg)
|
||||
{
|
||||
struct opal_event *signal = (struct opal_event*)arg;
|
||||
int signum, ret;
|
||||
|
||||
signum = OPAL_EVENT_SIGNAL(signal);
|
||||
if (!orte_execute_quiet){
|
||||
fprintf(stderr, "%s: Forwarding signal %d to job\n",
|
||||
orte_basename, signum);
|
||||
}
|
||||
|
||||
/** send the signal out to the processes, including any descendants */
|
||||
if (ORTE_SUCCESS != (ret = orte_plm.signal_job(ORTE_JOBID_WILDCARD, signum))) {
|
||||
fprintf(stderr, "Signal %d could not be sent to the job (returned %d)",
|
||||
signum, ret);
|
||||
}
|
||||
}
|
||||
|
@ -164,8 +164,6 @@ static int rte_finalize(void)
|
||||
|
||||
/* if I am a daemon, finalize using the default procedure */
|
||||
if (ORTE_PROC_IS_DAEMON) {
|
||||
/* don't need to do the barrier */
|
||||
orte_orted_exit_with_barrier = false;
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_finalize())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
|
@ -52,7 +52,7 @@
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
|
||||
#include "orte/mca/filem/filem.h"
|
||||
#include "orte/mca/filem/base/base.h"
|
||||
@ -195,7 +195,7 @@ static void filem_base_process_get_proc_node_name_cmd(orte_process_name_t* sende
|
||||
if (NULL == (jdata = orte_get_job_data_object(name.jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
ORTE_UPDATE_EXIT_STATUS(1);
|
||||
orte_trigger_event(&orte_exit);
|
||||
orte_jobs_complete();
|
||||
goto CLEANUP;
|
||||
}
|
||||
/* get the proc object for it */
|
||||
@ -203,7 +203,7 @@ static void filem_base_process_get_proc_node_name_cmd(orte_process_name_t* sende
|
||||
if (NULL == procs[name.vpid] || NULL == procs[name.vpid]->node) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
ORTE_UPDATE_EXIT_STATUS(1);
|
||||
orte_trigger_event(&orte_exit);
|
||||
orte_jobs_complete();
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
@ -213,7 +213,7 @@ static void filem_base_process_get_proc_node_name_cmd(orte_process_name_t* sende
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &(procs[name.vpid]->node->name), 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
ORTE_UPDATE_EXIT_STATUS(1);
|
||||
orte_trigger_event(&orte_exit);
|
||||
orte_jobs_complete();
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
@ -299,13 +299,13 @@ static void filem_base_process_get_remote_path_cmd(orte_process_name_t* sender,
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &tmp_name, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
ORTE_UPDATE_EXIT_STATUS(1);
|
||||
orte_trigger_event(&orte_exit);
|
||||
orte_jobs_complete();
|
||||
goto CLEANUP;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &file_type, 1, OPAL_INT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
ORTE_UPDATE_EXIT_STATUS(1);
|
||||
orte_trigger_event(&orte_exit);
|
||||
orte_jobs_complete();
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
|
@ -51,7 +51,6 @@ static int xcast(orte_jobid_t job,
|
||||
orte_rml_tag_t tag);
|
||||
static int bad_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf);
|
||||
static int bad_barrier(void);
|
||||
static int bad_onesided_barrier(void);
|
||||
static int modex(opal_list_t *procs);
|
||||
|
||||
/* Module def */
|
||||
@ -62,7 +61,6 @@ orte_grpcomm_base_module_t orte_grpcomm_bad_module = {
|
||||
bad_allgather,
|
||||
orte_grpcomm_base_allgather_list,
|
||||
bad_barrier,
|
||||
bad_onesided_barrier,
|
||||
orte_grpcomm_base_set_proc_attr,
|
||||
orte_grpcomm_base_get_proc_attr,
|
||||
modex,
|
||||
@ -70,7 +68,7 @@ orte_grpcomm_base_module_t orte_grpcomm_bad_module = {
|
||||
};
|
||||
|
||||
/* Local variables */
|
||||
static orte_grpcomm_collective_t barrier, allgather, onesided_barrier;
|
||||
static orte_grpcomm_collective_t barrier, allgather;
|
||||
|
||||
/**
|
||||
* Initialize the module
|
||||
@ -87,7 +85,6 @@ static int init(void)
|
||||
/* setup global variables */
|
||||
OBJ_CONSTRUCT(&barrier, orte_grpcomm_collective_t);
|
||||
OBJ_CONSTRUCT(&allgather, orte_grpcomm_collective_t);
|
||||
OBJ_CONSTRUCT(&onesided_barrier, orte_grpcomm_collective_t);
|
||||
|
||||
/* if we are a daemon or the hnp, we need to post a
|
||||
* recv to catch any collective operations
|
||||
@ -115,7 +112,6 @@ static void finalize(void)
|
||||
/* destruct the globals */
|
||||
OBJ_DESTRUCT(&barrier);
|
||||
OBJ_DESTRUCT(&allgather);
|
||||
OBJ_DESTRUCT(&onesided_barrier);
|
||||
|
||||
/* if we are a daemon or the hnp, we need to cancel the
|
||||
* recv we posted
|
||||
@ -229,124 +225,6 @@ static int bad_barrier(void)
|
||||
return rc;
|
||||
}
|
||||
|
||||
static void onesided_barrier_recv(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
||||
void* cbdata)
|
||||
{
|
||||
orte_grpcomm_collective_t *coll = (orte_grpcomm_collective_t*)cbdata;
|
||||
|
||||
OPAL_THREAD_LOCK(&coll->lock);
|
||||
/* flag as recvd */
|
||||
coll->recvd += 1;
|
||||
if (orte_process_info.num_procs == coll->recvd) {
|
||||
opal_condition_broadcast(&coll->cond);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&coll->lock);
|
||||
}
|
||||
|
||||
/* quick timeout loop */
|
||||
static bool timer_fired;
|
||||
|
||||
static void quicktime_cb(int fd, short event, void *cbdata)
|
||||
{
|
||||
/* declare it fired */
|
||||
timer_fired = true;
|
||||
}
|
||||
|
||||
static int bad_onesided_barrier(void)
|
||||
{
|
||||
opal_list_t daemon_tree;
|
||||
opal_list_item_t *item;
|
||||
opal_buffer_t buf;
|
||||
orte_process_name_t my_parent;
|
||||
opal_event_t *quicktime=NULL;
|
||||
struct timeval quicktimeval;
|
||||
int rc;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
|
||||
"%s grpcomm:bad: onesided barrier called",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* if we are not to use the barrier, then just return */
|
||||
if (!orte_orted_exit_with_barrier) {
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
/* if we are the HNP, we need to do a little delay to give
|
||||
* the orteds a chance to exit before we leave
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
|
||||
"%s grpcomm:bad: onesided barrier adding delay timer",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
quicktimeval.tv_sec = 0;
|
||||
quicktimeval.tv_usec = 100;
|
||||
timer_fired = false;
|
||||
ORTE_DETECT_TIMEOUT(&quicktime, orte_process_info.num_procs, 1000, 10000, quicktime_cb);
|
||||
ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* figure out how many participants we should be expecting */
|
||||
OBJ_CONSTRUCT(&daemon_tree, opal_list_t);
|
||||
my_parent.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
my_parent.vpid = orte_routed.get_routing_tree(&daemon_tree);
|
||||
OPAL_THREAD_LOCK(&onesided_barrier.lock);
|
||||
onesided_barrier.recvd += orte_process_info.num_procs - opal_list_get_size(&daemon_tree);
|
||||
OPAL_THREAD_UNLOCK(&onesided_barrier.lock);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
|
||||
"%s grpcomm:bad: onesided barrier num_participating %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(int)(orte_process_info.num_procs - opal_list_get_size(&daemon_tree))));
|
||||
|
||||
/* disassemble the daemon tree */
|
||||
while (NULL != (item = opal_list_remove_first(&daemon_tree))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&daemon_tree);
|
||||
|
||||
/* set the recv */
|
||||
if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
|
||||
ORTE_RML_TAG_ONESIDED_BARRIER,
|
||||
ORTE_RML_PERSISTENT,
|
||||
onesided_barrier_recv,
|
||||
&onesided_barrier))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
/* wait to get all my inputs */
|
||||
OPAL_THREAD_LOCK(&onesided_barrier.lock);
|
||||
while (onesided_barrier.recvd < orte_process_info.num_procs) {
|
||||
opal_condition_wait(&onesided_barrier.cond, &onesided_barrier.lock);
|
||||
}
|
||||
/* reset the collective */
|
||||
onesided_barrier.recvd = 0;
|
||||
OPAL_THREAD_UNLOCK(&onesided_barrier.lock);
|
||||
|
||||
/* cancel the recv */
|
||||
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ONESIDED_BARRIER);
|
||||
|
||||
/* if I am the HNP, then we are done */
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* send a zero-byte msg to my parent */
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||
/* send it */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
|
||||
"%s grpcomm:bad:onsided:barrier not the HNP - sending to parent %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&my_parent)));
|
||||
if (0 > (rc = orte_rml.send_buffer(&my_parent, &buf, ORTE_RML_TAG_ONESIDED_BARRIER, 0))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
return rc;
|
||||
}
|
||||
OBJ_DESTRUCT(&buf);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static void allgather_recv(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag, void *cbdata)
|
||||
|
@ -55,7 +55,6 @@ static int xcast(orte_jobid_t job,
|
||||
orte_rml_tag_t tag);
|
||||
static int basic_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf);
|
||||
static int basic_barrier(void);
|
||||
static int basic_onesided_barrier(void);
|
||||
static int modex(opal_list_t *procs);
|
||||
static int set_proc_attr(const char *attr_name, const void *data, size_t size);
|
||||
static int get_proc_attr(const orte_process_name_t proc,
|
||||
@ -70,7 +69,6 @@ orte_grpcomm_base_module_t orte_grpcomm_basic_module = {
|
||||
basic_allgather,
|
||||
orte_grpcomm_base_allgather_list,
|
||||
basic_barrier,
|
||||
basic_onesided_barrier,
|
||||
set_proc_attr,
|
||||
get_proc_attr,
|
||||
modex,
|
||||
@ -78,7 +76,7 @@ orte_grpcomm_base_module_t orte_grpcomm_basic_module = {
|
||||
};
|
||||
|
||||
/* Local variables */
|
||||
static orte_grpcomm_collective_t barrier, allgather, onesided_barrier;
|
||||
static orte_grpcomm_collective_t barrier, allgather;
|
||||
|
||||
static bool recv_on;
|
||||
static opal_buffer_t *profile_buf=NULL;
|
||||
@ -118,7 +116,6 @@ static int init(void)
|
||||
/* setup global variables */
|
||||
OBJ_CONSTRUCT(&barrier, orte_grpcomm_collective_t);
|
||||
OBJ_CONSTRUCT(&allgather, orte_grpcomm_collective_t);
|
||||
OBJ_CONSTRUCT(&onesided_barrier, orte_grpcomm_collective_t);
|
||||
|
||||
if (ORTE_PROC_IS_HNP && recv_on) {
|
||||
/* open the profile file for writing */
|
||||
@ -186,7 +183,6 @@ static void finalize(void)
|
||||
/* destruct the globals */
|
||||
OBJ_DESTRUCT(&barrier);
|
||||
OBJ_DESTRUCT(&allgather);
|
||||
OBJ_DESTRUCT(&onesided_barrier);
|
||||
|
||||
if (ORTE_PROC_IS_HNP && recv_on) {
|
||||
/* if we are profiling and I am the HNP, then stop the
|
||||
@ -311,124 +307,6 @@ static int basic_barrier(void)
|
||||
return rc;
|
||||
}
|
||||
|
||||
static void onesided_barrier_recv(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
||||
void* cbdata)
|
||||
{
|
||||
orte_grpcomm_collective_t *coll = (orte_grpcomm_collective_t*)cbdata;
|
||||
|
||||
OPAL_THREAD_LOCK(&coll->lock);
|
||||
/* flag as recvd */
|
||||
coll->recvd += 1;
|
||||
if (orte_process_info.num_procs == coll->recvd) {
|
||||
opal_condition_broadcast(&coll->cond);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&coll->lock);
|
||||
}
|
||||
/* quick timeout loop: set to true by quicktime_cb when the timer expires */
static bool timer_fired;

/*
 * Event-style timer callback.  Its only effect is to raise timer_fired;
 * fd, event and cbdata are accepted for signature compatibility and unused.
 */
static void quicktime_cb(int fd, short event, void *cbdata)
{
    (void)fd;
    (void)event;
    (void)cbdata;

    /* declare it fired */
    timer_fired = true;
}
|
||||
|
||||
static int basic_onesided_barrier(void)
|
||||
{
|
||||
opal_list_t daemon_tree;
|
||||
opal_list_item_t *item;
|
||||
opal_buffer_t buf;
|
||||
orte_process_name_t my_parent;
|
||||
opal_event_t *quicktime=NULL;
|
||||
struct timeval quicktimeval;
|
||||
int rc;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
|
||||
"%s grpcomm:basic: onesided barrier called",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* if we are not to use the barrier, then just return */
|
||||
if (!orte_orted_exit_with_barrier) {
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
/* if we are the HNP, we need to do a little delay to give
|
||||
* the orteds a chance to exit before we leave
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
|
||||
"%s grpcomm:basic: onesided barrier adding delay timer",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
quicktimeval.tv_sec = 0;
|
||||
quicktimeval.tv_usec = 100;
|
||||
timer_fired = false;
|
||||
ORTE_DETECT_TIMEOUT(&quicktime, orte_process_info.num_procs, 1000, 10000, quicktime_cb);
|
||||
ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* figure out how many participants we should be expecting */
|
||||
OBJ_CONSTRUCT(&daemon_tree, opal_list_t);
|
||||
my_parent.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
my_parent.vpid = orte_routed.get_routing_tree(&daemon_tree);
|
||||
OPAL_THREAD_LOCK(&onesided_barrier.lock);
|
||||
onesided_barrier.recvd += orte_process_info.num_procs - opal_list_get_size(&daemon_tree);
|
||||
OPAL_THREAD_UNLOCK(&onesided_barrier.lock);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
|
||||
"%s grpcomm:basic: onesided barrier num_participating %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(int)(orte_process_info.num_procs - opal_list_get_size(&daemon_tree))));
|
||||
|
||||
/* disassemble the daemon tree */
|
||||
while (NULL != (item = opal_list_remove_first(&daemon_tree))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&daemon_tree);
|
||||
|
||||
/* set the recv */
|
||||
if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
|
||||
ORTE_RML_TAG_ONESIDED_BARRIER,
|
||||
ORTE_RML_PERSISTENT,
|
||||
onesided_barrier_recv,
|
||||
&onesided_barrier))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
/* wait to get all my inputs */
|
||||
OPAL_THREAD_LOCK(&onesided_barrier.lock);
|
||||
while (onesided_barrier.recvd < orte_process_info.num_procs) {
|
||||
opal_condition_wait(&onesided_barrier.cond, &onesided_barrier.lock);
|
||||
}
|
||||
/* reset the collective */
|
||||
onesided_barrier.recvd = 0;
|
||||
OPAL_THREAD_UNLOCK(&onesided_barrier.lock);
|
||||
|
||||
/* cancel the recv */
|
||||
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ONESIDED_BARRIER);
|
||||
|
||||
/* if I am the HNP, then we are done */
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* send a zero-byte msg to my parent */
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||
/* send it */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
|
||||
"%s grpcomm:basic:onsided:barrier not the HNP - sending to parent %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&my_parent)));
|
||||
if (0 > (rc = orte_rml.send_buffer(&my_parent, &buf, ORTE_RML_TAG_ONESIDED_BARRIER, 0))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
return rc;
|
||||
}
|
||||
OBJ_DESTRUCT(&buf);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static void allgather_recv(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag, void *cbdata)
|
||||
|
@ -75,7 +75,6 @@ orte_grpcomm_base_module_t orte_grpcomm_cnos_module = {
|
||||
allgather,
|
||||
allgather_list,
|
||||
orte_grpcomm_cnos_barrier,
|
||||
orte_grpcomm_cnos_barrier,
|
||||
set_proc_attr,
|
||||
get_proc_attr,
|
||||
modex,
|
||||
|
@ -71,11 +71,6 @@ typedef int (*orte_grpcomm_base_module_allgather_list_fn_t)(opal_list_t *names,
|
||||
/* barrier function */
|
||||
typedef int (*orte_grpcomm_base_module_barrier_fn_t)(void);
|
||||
|
||||
/* one-sided barrier function - process releases once its
|
||||
* contribution is complete
|
||||
*/
|
||||
typedef int (*orte_grpcomm_base_module_onesided_barrier_fn_t)(void);
|
||||
|
||||
|
||||
/** DATA EXCHANGE FUNCTIONS - SEE ompi/runtime/ompi_module_exchange.h FOR A DESCRIPTION
|
||||
* OF HOW THIS ALL WORKS
|
||||
@ -108,7 +103,6 @@ struct orte_grpcomm_base_module_2_0_0_t {
|
||||
orte_grpcomm_base_module_allgather_fn_t allgather;
|
||||
orte_grpcomm_base_module_allgather_list_fn_t allgather_list;
|
||||
orte_grpcomm_base_module_barrier_fn_t barrier;
|
||||
orte_grpcomm_base_module_onesided_barrier_fn_t onesided_barrier;
|
||||
/* modex functions */
|
||||
orte_grpcomm_base_module_modex_set_proc_attr_fn_t set_proc_attr;
|
||||
orte_grpcomm_base_module_modex_get_proc_attr_fn_t get_proc_attr;
|
||||
|
@ -68,7 +68,6 @@ orte_grpcomm_base_module_t orte_grpcomm_hier_module = {
|
||||
hier_allgather,
|
||||
orte_grpcomm_base_allgather_list,
|
||||
hier_barrier,
|
||||
NULL, /* onesided barrier only used by daemons */
|
||||
set_proc_attr,
|
||||
get_proc_attr,
|
||||
modex,
|
||||
|
@ -48,7 +48,6 @@ static int xcast(orte_jobid_t job,
|
||||
orte_rml_tag_t tag);
|
||||
static int mcast_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf);
|
||||
static int mcast_barrier(void);
|
||||
static int mcast_onesided_barrier(void);
|
||||
static int modex(opal_list_t *procs);
|
||||
static int get_proc_attr(const orte_process_name_t proc,
|
||||
const char * attribute_name, void **val,
|
||||
@ -62,7 +61,6 @@ orte_grpcomm_base_module_t orte_grpcomm_mcast_module = {
|
||||
mcast_allgather,
|
||||
orte_grpcomm_base_allgather_list,
|
||||
mcast_barrier,
|
||||
mcast_onesided_barrier,
|
||||
orte_grpcomm_base_set_proc_attr,
|
||||
get_proc_attr,
|
||||
modex,
|
||||
@ -77,7 +75,7 @@ static void daemon_recv(int status,
|
||||
opal_buffer_t *buf, void* cbdata);
|
||||
|
||||
/* Local variables */
|
||||
static orte_grpcomm_collective_t barrier, allgather, onesided_barrier;
|
||||
static orte_grpcomm_collective_t barrier, allgather;
|
||||
|
||||
/**
|
||||
* Initialize the module
|
||||
@ -93,7 +91,6 @@ static int init(void)
|
||||
/* setup global variables */
|
||||
OBJ_CONSTRUCT(&barrier, orte_grpcomm_collective_t);
|
||||
OBJ_CONSTRUCT(&allgather, orte_grpcomm_collective_t);
|
||||
OBJ_CONSTRUCT(&onesided_barrier, orte_grpcomm_collective_t);
|
||||
|
||||
/* point to our collective function */
|
||||
orte_grpcomm_base.daemon_coll = orte_grpcomm_mcast_daemon_coll;
|
||||
@ -130,7 +127,6 @@ static void finalize(void)
|
||||
/* destruct the globals */
|
||||
OBJ_DESTRUCT(&barrier);
|
||||
OBJ_DESTRUCT(&allgather);
|
||||
OBJ_DESTRUCT(&onesided_barrier);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -285,73 +281,6 @@ static int mcast_barrier(void)
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/* quick timeout loop: becomes true once quicktime_cb has run */
static bool timer_fired;

/*
 * Timer callback in the (fd, event, cbdata) handler form.  It marks the
 * timer as expired and does nothing else; all three arguments are unused.
 */
static void quicktime_cb(int fd, short event, void *cbdata)
{
    (void)fd;
    (void)event;
    (void)cbdata;

    /* declare it fired */
    timer_fired = true;
}
|
||||
|
||||
static int mcast_onesided_barrier(void)
|
||||
{
|
||||
opal_event_t *quicktime=NULL;
|
||||
struct timeval quicktimeval;
|
||||
int rc;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
|
||||
"%s grpcomm:mcast: onesided barrier called",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* if I am alone, just return */
|
||||
if (1 == orte_process_info.num_procs) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* if we are not to use the barrier, then just return */
|
||||
if (!orte_orted_exit_with_barrier) {
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
/* if we are the HNP, we need to do a little delay to give
|
||||
* the orteds a chance to exit before we leave
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
|
||||
"%s grpcomm:mcast: onesided barrier adding delay timer",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
quicktimeval.tv_sec = 0;
|
||||
quicktimeval.tv_usec = 100;
|
||||
timer_fired = false;
|
||||
ORTE_DETECT_TIMEOUT(&quicktime, orte_process_info.num_procs, 1000, 10000, quicktime_cb);
|
||||
ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* if we are not the HNP, just send and leave */
|
||||
if (!ORTE_PROC_IS_HNP) {
|
||||
if (ORTE_SUCCESS != (rc = xcast(ORTE_PROC_MY_NAME->jobid, NULL, ORTE_RML_TAG_ONESIDED_BARRIER))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* initialize things */
|
||||
OPAL_THREAD_LOCK(&onesided_barrier.lock);
|
||||
onesided_barrier.recvd += 1; /* account for me */
|
||||
OPAL_THREAD_UNLOCK(&onesided_barrier.lock);
|
||||
|
||||
/* wait to complete */
|
||||
OPAL_THREAD_LOCK(&onesided_barrier.lock);
|
||||
while (orte_process_info.num_procs <= onesided_barrier.recvd) {
|
||||
opal_condition_wait(&onesided_barrier.cond, &onesided_barrier.lock);
|
||||
}
|
||||
/* reset the collective */
|
||||
onesided_barrier.recvd = 0;
|
||||
OPAL_THREAD_UNLOCK(&onesided_barrier.lock);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static void allgather_recv(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag, void *cbdata)
|
||||
@ -551,16 +480,6 @@ static void daemon_recv(int status,
|
||||
ORTE_MESSAGE_EVENT(sender, buf, ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor);
|
||||
break;
|
||||
|
||||
case ORTE_RML_TAG_ONESIDED_BARRIER:
|
||||
OPAL_THREAD_LOCK(&onesided_barrier.lock);
|
||||
onesided_barrier.recvd += 1;
|
||||
/* check for completion */
|
||||
if (orte_process_info.num_procs <= onesided_barrier.recvd) {
|
||||
opal_condition_broadcast(&onesided_barrier.cond);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&onesided_barrier.lock);
|
||||
break;
|
||||
|
||||
case ORTE_RML_TAG_BARRIER:
|
||||
OPAL_THREAD_LOCK(&barrier.lock);
|
||||
/* the recv is the trigger */
|
||||
|
@ -9,7 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2009 Institut National de Recherche en Informatique
|
||||
* et Automatique. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -57,7 +57,7 @@
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/orte_locks.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
@ -149,7 +149,7 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
|
||||
if (NULL == crud) {
|
||||
orte_never_launched = true;
|
||||
ORTE_UPDATE_EXIT_STATUS(0);
|
||||
orte_trigger_event(&orte_exit);
|
||||
orte_jobs_complete();
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
orte_util_nidmap_init(NULL);
|
||||
@ -173,7 +173,7 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
|
||||
free(crud);
|
||||
orte_never_launched = true;
|
||||
ORTE_UPDATE_EXIT_STATUS(0);
|
||||
orte_trigger_event(&orte_exit);
|
||||
orte_jobs_complete();
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
@ -198,7 +198,7 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
|
||||
if (orte_do_not_launch) {
|
||||
orte_never_launched = true;
|
||||
ORTE_UPDATE_EXIT_STATUS(0);
|
||||
orte_trigger_event(&orte_exit);
|
||||
orte_jobs_complete();
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
|
||||
@ -214,7 +214,7 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
|
||||
ORTE_VPID_PRINT(jdata->num_procs));
|
||||
orte_never_launched = true;
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
orte_trigger_event(&orte_exit);
|
||||
orte_jobs_complete();
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
|
@ -49,7 +49,7 @@
|
||||
#include "orte/mca/ras/base/base.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
|
||||
#include "orte/mca/plm/plm_types.h"
|
||||
#include "orte/mca/plm/plm.h"
|
||||
@ -509,7 +509,7 @@ static void process_msg(int fd, short event, void *data)
|
||||
|
||||
/* see if an error occurred - if so, wakeup the HNP so we can exit */
|
||||
if (ORTE_PROC_IS_HNP && ORTE_SUCCESS != rc) {
|
||||
orte_trigger_event(&orte_exit);
|
||||
orte_jobs_complete();
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -114,9 +114,6 @@ static int plm_ccp_init(void)
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
/* we don't need a barrier to exit */
|
||||
orte_orted_exit_with_barrier = false;
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
@ -227,9 +227,6 @@ int orte_plm_process_init(void)
|
||||
SecureZeroMemory(user_name, sizeof(user_name));
|
||||
SecureZeroMemory(user_password, sizeof(user_password));
|
||||
|
||||
/* we don't need a barrier to exit */
|
||||
orte_orted_exit_with_barrier = false;
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
@ -72,6 +72,7 @@
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
@ -396,7 +397,7 @@ int orte_plm_rshd_terminate_job(orte_jobid_t jobid)
|
||||
*/
|
||||
int orte_plm_rshd_terminate_orteds(void)
|
||||
{
|
||||
orte_trigger_event(&orteds_exit);
|
||||
orte_quit();
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -67,6 +67,7 @@
|
||||
#include "orte/util/regex.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rmaps/rmaps.h"
|
||||
|
||||
@ -129,9 +130,6 @@ static int plm_slurm_init(void)
|
||||
local_launch_available = true;
|
||||
}
|
||||
|
||||
/* we don't need a barrier to exit */
|
||||
orte_orted_exit_with_barrier = false;
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
@ -522,7 +520,7 @@ static int plm_slurm_terminate_orteds(void)
|
||||
jdata->state = ORTE_JOB_STATE_TERMINATED;
|
||||
/* need to set the #terminated value to avoid an incorrect error msg */
|
||||
jdata->num_terminated = jdata->num_procs;
|
||||
orte_trigger_event(&orteds_exit);
|
||||
orte_quit();
|
||||
}
|
||||
|
||||
return rc;
|
||||
@ -615,7 +613,7 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
|
||||
jdata->state = ORTE_JOB_STATE_TERMINATED;
|
||||
/* need to set the #terminated value to avoid an incorrect error msg */
|
||||
jdata->num_terminated = jdata->num_procs;
|
||||
orte_trigger_event(&orteds_exit);
|
||||
orte_quit();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -68,6 +68,7 @@
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rmaps/rmaps.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/plm/base/plm_private.h"
|
||||
@ -716,7 +717,7 @@ int plm_tmd_terminate_orteds(void)
|
||||
} else {
|
||||
jdata->state = ORTE_JOB_STATE_TERMINATED;
|
||||
}
|
||||
orte_trigger_event(&orteds_exit);
|
||||
orte_quit();
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
@ -41,6 +41,7 @@
|
||||
#include "orte/util/dash_host/dash_host.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/comm/comm.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
|
||||
#include "orte/mca/ras/base/ras_private.h"
|
||||
|
||||
@ -169,7 +170,7 @@ int orte_ras_base_allocate(orte_job_t *jdata)
|
||||
OBJ_DESTRUCT(&nodes);
|
||||
orte_show_help("help-ras-base.txt", "ras-base:no-allocation", true);
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
orte_trigger_event(&orte_exit);
|
||||
orte_jobs_complete();
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
|
@ -164,30 +164,27 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_msg_packet_t);
|
||||
/* profile data */
|
||||
#define ORTE_RML_TAG_GRPCOMM_PROFILE 33
|
||||
|
||||
/* onesided barrier */
|
||||
#define ORTE_RML_TAG_ONESIDED_BARRIER 34
|
||||
|
||||
/* bootstrap */
|
||||
#define ORTE_RML_TAG_BOOTSTRAP 35
|
||||
#define ORTE_RML_TAG_BOOTSTRAP 34
|
||||
|
||||
/* TCP "fake" multicast */
|
||||
#define ORTE_RML_TAG_MULTICAST 36
|
||||
#define ORTE_RML_TAG_MULTICAST 35
|
||||
/* multicast messages sent direct */
|
||||
#define ORTE_RML_TAG_MULTICAST_DIRECT 37
|
||||
#define ORTE_RML_TAG_MULTICAST_DIRECT 36
|
||||
/* multicast messages to be relayed */
|
||||
#define ORTE_RML_TAG_MULTICAST_RELAY 38
|
||||
#define ORTE_RML_TAG_MULTICAST_RELAY 37
|
||||
|
||||
/* tag for receiving ack of abort msg */
|
||||
#define ORTE_RML_TAG_ABORT 39
|
||||
#define ORTE_RML_TAG_ABORT 38
|
||||
|
||||
/* tag for receiving heartbeats */
|
||||
#define ORTE_RML_TAG_HEARTBEAT 40
|
||||
#define ORTE_RML_TAG_HEARTBEAT 39
|
||||
|
||||
/* notifier data */
|
||||
#define ORTE_RML_TAG_NOTIFIER_HNP 41
|
||||
#define ORTE_RML_TAG_NOTIFIER_HNP 40
|
||||
|
||||
/* comm leader failed */
|
||||
#define ORTE_RML_TAG_LEADER 42
|
||||
#define ORTE_RML_TAG_LEADER 41
|
||||
|
||||
#define ORTE_RML_TAG_MAX 100
|
||||
|
||||
|
@ -46,6 +46,7 @@ static int update_routing_tree(void);
|
||||
static orte_vpid_t get_routing_tree(opal_list_t *children);
|
||||
static int get_wireup_info(opal_buffer_t *buf);
|
||||
static int set_lifeline(orte_process_name_t *proc);
|
||||
static size_t num_routes(void);
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
static int binomial_ft_event(int state);
|
||||
@ -64,6 +65,7 @@ orte_routed_module_t orte_routed_binomial_module = {
|
||||
update_routing_tree,
|
||||
get_routing_tree,
|
||||
get_wireup_info,
|
||||
num_routes,
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
binomial_ft_event
|
||||
#else
|
||||
@ -698,6 +700,14 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
|
||||
|
||||
static int route_lost(const orte_process_name_t *route)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
orte_routed_tree_t *child;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
|
||||
"%s route to %s lost",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(route)));
|
||||
|
||||
/* if we lose the connection to the lifeline and we are NOT already,
|
||||
* in finalize, tell the OOB to abort.
|
||||
* NOTE: we cannot call abort from here as the OOB needs to first
|
||||
@ -712,6 +722,23 @@ static int route_lost(const orte_process_name_t *route)
|
||||
return ORTE_ERR_FATAL;
|
||||
}
|
||||
|
||||
/* if we are the HNP or a daemon, is it a daemon, and one of my children? if so, then
|
||||
* remove it from the child list
|
||||
*/
|
||||
if ((ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) &&
|
||||
route->jobid == ORTE_PROC_MY_NAME->jobid) {
|
||||
for (item = opal_list_get_first(&my_children);
|
||||
item != opal_list_get_end(&my_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_routed_tree_t*)item;
|
||||
if (child->vpid == route->vpid) {
|
||||
opal_list_remove_item(&my_children, item);
|
||||
OBJ_RELEASE(item);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* we don't care about this one, so return success */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
@ -904,6 +931,14 @@ static int get_wireup_info(opal_buffer_t *buf)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static size_t num_routes(void)
|
||||
{
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
|
||||
"%s num routes %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(int)opal_list_get_size(&my_children)));
|
||||
return opal_list_get_size(&my_children);
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
static int binomial_ft_event(int state)
|
||||
|
@ -49,6 +49,7 @@ static int update_routing_tree(void);
|
||||
static orte_vpid_t get_routing_tree(opal_list_t *children);
|
||||
static int get_wireup_info(opal_buffer_t *buf);
|
||||
static int set_lifeline(orte_process_name_t *proc);
|
||||
static size_t num_routes(void);
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
static int cm_ft_event(int state);
|
||||
@ -67,6 +68,7 @@ orte_routed_module_t orte_routed_cm_module = {
|
||||
update_routing_tree,
|
||||
get_routing_tree,
|
||||
get_wireup_info,
|
||||
num_routes,
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
cm_ft_event
|
||||
#else
|
||||
@ -734,22 +736,8 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
|
||||
|
||||
static int route_lost(const orte_process_name_t *route)
|
||||
{
|
||||
/* if we are the HNP and lose a route, check to see if it is
|
||||
* to a daemon
|
||||
*/
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
if (ORTE_PROC_MY_NAME->jobid == route->jobid) {
|
||||
/* this was a daemon - notify the errmgr
|
||||
* so we can take appropriate recovery, if desired
|
||||
*/
|
||||
opal_output(0, "%s routed:cm: daemon %s has died",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_VPID_PRINT(route->vpid));
|
||||
orte_errmgr.update_state(route->jobid, ORTE_JOB_STATE_COMM_FAILED,
|
||||
(orte_process_name_t*)route,
|
||||
ORTE_PROC_STATE_COMM_FAILED, 0, 1);
|
||||
}
|
||||
/* either way, take no further action */
|
||||
/* take no further action */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
@ -890,6 +878,23 @@ static int get_wireup_info(opal_buffer_t *buf)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static size_t num_routes(void)
|
||||
{
|
||||
orte_job_t *jdata;
|
||||
|
||||
if (!ORTE_PROC_IS_HNP) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* if I am the HNP, then the number of routes is
|
||||
* the number of daemons (other than me) still alive
|
||||
*/
|
||||
if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return 0;
|
||||
}
|
||||
return (jdata->num_procs - jdata->num_terminated - 1);
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
static int cm_ft_event(int state)
|
||||
|
@ -40,6 +40,7 @@ static int update_routing_tree(void);
|
||||
static orte_vpid_t get_routing_tree(opal_list_t *children);
|
||||
static int get_wireup_info(opal_buffer_t *buf);
|
||||
static int set_lifeline(orte_process_name_t *proc);
|
||||
static size_t num_routes(void);
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
static int direct_ft_event(int state);
|
||||
@ -58,6 +59,7 @@ orte_routed_module_t orte_routed_direct_module = {
|
||||
update_routing_tree,
|
||||
get_routing_tree,
|
||||
get_wireup_info,
|
||||
num_routes,
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
direct_ft_event
|
||||
#else
|
||||
@ -336,6 +338,24 @@ static int get_wireup_info(opal_buffer_t *buf)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static size_t num_routes(void)
|
||||
{
|
||||
orte_job_t *jdata;
|
||||
|
||||
if (!ORTE_PROC_IS_HNP) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* if I am the HNP, then the number of routes is
|
||||
* the number of daemons still alive (other than me)
|
||||
*/
|
||||
if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return 0;
|
||||
}
|
||||
|
||||
return (jdata->num_procs - jdata->num_terminated - 1);
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
static int direct_ft_event(int state)
|
||||
|
@ -45,6 +45,7 @@ static int update_routing_tree(void);
|
||||
static orte_vpid_t get_routing_tree(opal_list_t *children);
|
||||
static int get_wireup_info(opal_buffer_t *buf);
|
||||
static int set_lifeline(orte_process_name_t *proc);
|
||||
static size_t num_routes(void);
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
static int linear_ft_event(int state);
|
||||
@ -63,6 +64,7 @@ orte_routed_module_t orte_routed_linear_module = {
|
||||
update_routing_tree,
|
||||
get_routing_tree,
|
||||
get_wireup_info,
|
||||
num_routes,
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
linear_ft_event
|
||||
#else
|
||||
@ -777,6 +779,11 @@ static int get_wireup_info(opal_buffer_t *buf)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Route count for this module: unconditionally zero.
 */
static size_t num_routes(void)
{
    const size_t no_routes = 0;

    return no_routes;
}
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
static int linear_ft_event(int state)
|
||||
{
|
||||
|
@ -46,6 +46,7 @@ static int update_routing_tree(void);
|
||||
static orte_vpid_t get_routing_tree(opal_list_t *children);
|
||||
static int get_wireup_info(opal_buffer_t *buf);
|
||||
static int set_lifeline(orte_process_name_t *proc);
|
||||
static size_t num_routes(void);
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
static int radix_ft_event(int state);
|
||||
@ -64,6 +65,7 @@ orte_routed_module_t orte_routed_radix_module = {
|
||||
update_routing_tree,
|
||||
get_routing_tree,
|
||||
get_wireup_info,
|
||||
num_routes,
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
radix_ft_event
|
||||
#else
|
||||
@ -687,6 +689,9 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
|
||||
|
||||
static int route_lost(const orte_process_name_t *route)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
orte_routed_tree_t *child;
|
||||
|
||||
/* if we lose the connection to the lifeline and we are NOT already,
|
||||
* in finalize, tell the OOB to abort.
|
||||
* NOTE: we cannot call abort from here as the OOB needs to first
|
||||
@ -701,6 +706,23 @@ static int route_lost(const orte_process_name_t *route)
|
||||
return ORTE_ERR_FATAL;
|
||||
}
|
||||
|
||||
/* if we are the HNP or daemon, and the route is a daemon,
|
||||
* see if it is one of our children - if so, remove it
|
||||
*/
|
||||
if ((ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) &&
|
||||
route->jobid == ORTE_PROC_MY_NAME->jobid) {
|
||||
for (item = opal_list_get_first(&my_children);
|
||||
item != opal_list_get_end(&my_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_routed_tree_t*)item;
|
||||
if (child->vpid == route->vpid) {
|
||||
opal_list_remove_item(&my_children, item);
|
||||
OBJ_RELEASE(item);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* we don't care about this one, so return success */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
@ -896,6 +918,11 @@ static int get_wireup_info(opal_buffer_t *buf)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static size_t num_routes(void)
|
||||
{
|
||||
return opal_list_get_size(&my_children);
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
static int radix_ft_event(int state)
|
||||
{
|
||||
|
@ -215,6 +215,13 @@ typedef orte_vpid_t (*orte_routed_module_get_routing_tree_fn_t)(opal_list_t *chi
|
||||
*/
|
||||
typedef int (*orte_routed_module_set_lifeline_fn_t)(orte_process_name_t *proc);
|
||||
|
||||
/*
|
||||
* Get the number of routes supported by this process
|
||||
*
|
||||
* Returns the size of the routing tree using an O(1) function
|
||||
*/
|
||||
typedef size_t (*orte_routed_module_num_routes_fn_t)(void);
|
||||
|
||||
/**
|
||||
* Handle fault tolerance updates
|
||||
*
|
||||
@ -251,6 +258,7 @@ struct orte_routed_module_t {
|
||||
orte_routed_module_update_routing_tree_fn_t update_routing_tree;
|
||||
orte_routed_module_get_routing_tree_fn_t get_routing_tree;
|
||||
orte_routed_module_get_wireup_info_fn_t get_wireup_info;
|
||||
orte_routed_module_num_routes_fn_t num_routes;
|
||||
/* FT Notification */
|
||||
orte_routed_module_ft_event_fn_t ft_event;
|
||||
};
|
||||
|
@ -41,6 +41,7 @@ static int update_routing_tree(void);
|
||||
static orte_vpid_t get_routing_tree(opal_list_t *children);
|
||||
static int get_wireup_info(opal_buffer_t *buf);
|
||||
static int set_lifeline(orte_process_name_t *proc);
|
||||
static size_t num_routes(void);
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
static int slave_ft_event(int state);
|
||||
@ -59,6 +60,7 @@ orte_routed_module_t orte_routed_slave_module = {
|
||||
update_routing_tree,
|
||||
get_routing_tree,
|
||||
get_wireup_info,
|
||||
num_routes,
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
slave_ft_event
|
||||
#else
|
||||
@ -288,6 +290,10 @@ static int get_wireup_info(opal_buffer_t *buf)
|
||||
return ORTE_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
/*
 * Report the number of dependent routes for this routed module.
 * This implementation always reports zero.
 */
static size_t num_routes(void)
{
    return 0;
}
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
static int slave_ft_event(int state)
|
||||
|
@ -73,6 +73,7 @@
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
|
||||
#include "orte/orted/orted.h"
|
||||
|
||||
@ -638,16 +639,12 @@ int orte_daemon_process_commands(orte_process_name_t* sender,
|
||||
opal_output(0, "%s orted_cmd: received exit cmd",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
}
|
||||
/* if we are the HNP, just kill our local procs */
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
orte_odls.kill_local_procs(NULL);
|
||||
return ORTE_SUCCESS;
|
||||
/* kill the local procs */
|
||||
orte_odls.kill_local_procs(NULL);
|
||||
/* if all our dependent routes are gone, exit */
|
||||
if (0 == orte_routed.num_routes()) {
|
||||
orte_quit();
|
||||
}
|
||||
|
||||
/* else we are a daemon, trigger our exit - we will kill our
|
||||
* local procs on our way out
|
||||
*/
|
||||
orte_trigger_event(&orte_exit);
|
||||
return ORTE_SUCCESS;
|
||||
break;
|
||||
|
||||
@ -661,7 +658,7 @@ int orte_daemon_process_commands(orte_process_name_t* sender,
|
||||
* NOTE: this event will fire -after- any zero-time events
|
||||
* so any pending relays -do- get sent first
|
||||
*/
|
||||
orte_trigger_event(&orte_exit);
|
||||
orte_quit();
|
||||
return ORTE_SUCCESS;
|
||||
break;
|
||||
|
||||
|
@ -22,6 +22,7 @@
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
@ -61,13 +62,11 @@
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/mca/sysinfo/sysinfo.h"
|
||||
|
||||
#include "orte/constants.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/runtime/orte_locks.h"
|
||||
#include "orte/mca/rml/base/rml_contact.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
@ -87,30 +86,16 @@
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/orte_locks.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
|
||||
#include "orte/orted/orted.h"
|
||||
|
||||
/*
|
||||
* Globals
|
||||
*/
|
||||
|
||||
static opal_event_t term_handler;
|
||||
static opal_event_t int_handler;
|
||||
static opal_event_t pipe_handler;
|
||||
static opal_event_t epipe_handler;
|
||||
#ifndef __WINDOWS__
|
||||
static opal_event_t sigusr1_handler;
|
||||
static opal_event_t sigusr2_handler;
|
||||
#endif /* __WINDOWS__ */
|
||||
char *log_path = NULL;
|
||||
static opal_event_t *orted_exit_event;
|
||||
static bool signals_set=false;
|
||||
|
||||
static void shutdown_callback(int fd, short flags, void *arg);
|
||||
static void shutdown_signal(int fd, short flags, void *arg);
|
||||
static void signal_callback(int fd, short event, void *arg);
|
||||
static void epipe_signal_callback(int fd, short flags, void *arg);
|
||||
|
||||
static struct {
|
||||
bool debug;
|
||||
@ -221,10 +206,7 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
|
||||
int orte_daemon(int argc, char *argv[])
|
||||
{
|
||||
int ret = 0;
|
||||
int fd;
|
||||
opal_cmd_line_t *cmd_line = NULL;
|
||||
char log_file[PATH_MAX];
|
||||
char *jobidstring;
|
||||
char *rml_uri;
|
||||
int i;
|
||||
opal_buffer_t *buffer;
|
||||
@ -286,9 +268,6 @@ int orte_daemon(int argc, char *argv[])
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* setup the exit triggers */
|
||||
OBJ_CONSTRUCT(&orte_exit, orte_trigger_event_t);
|
||||
|
||||
/* save the environment for launch purposes. This MUST be
|
||||
* done so that we can pass it to any local procs we
|
||||
* spawn - otherwise, those local procs won't see any
|
||||
@ -373,7 +352,7 @@ int orte_daemon(int argc, char *argv[])
|
||||
* and have it kill us
|
||||
*/
|
||||
if (0 < orted_globals.fail_delay) {
|
||||
ORTE_TIMER_EVENT(orted_globals.fail_delay, 0, shutdown_signal);
|
||||
ORTE_TIMER_EVENT(orted_globals.fail_delay, 0, shutdown_callback);
|
||||
|
||||
} else {
|
||||
opal_output(0, "%s is executing clean %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -420,17 +399,6 @@ int orte_daemon(int argc, char *argv[])
|
||||
ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid;
|
||||
}
|
||||
|
||||
/* setup an event we can wait for to tell
|
||||
* us to terminate - both normal and abnormal
|
||||
* termination will call us here. Use the same exit
|
||||
* fd as orterun so that orte_comm can wake either of us up
|
||||
* since we share that code
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_wait_event(&orted_exit_event, &orte_exit, "orted_shutdown", shutdown_callback))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
goto DONE;
|
||||
}
|
||||
|
||||
/* setup the primary daemon command receive function */
|
||||
ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON,
|
||||
ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL);
|
||||
@ -439,69 +407,6 @@ int orte_daemon(int argc, char *argv[])
|
||||
goto DONE;
|
||||
}
|
||||
|
||||
#ifndef __WINDOWS__
|
||||
/* setup callback for SIGPIPE */
|
||||
opal_signal_set(&epipe_handler, SIGPIPE,
|
||||
epipe_signal_callback, &epipe_handler);
|
||||
opal_signal_add(&epipe_handler, NULL);
|
||||
/* Set signal handlers to catch kill signals so we can properly clean up
|
||||
* after ourselves.
|
||||
*/
|
||||
opal_event_set(&term_handler, SIGTERM, OPAL_EV_SIGNAL,
|
||||
shutdown_signal, NULL);
|
||||
opal_event_add(&term_handler, NULL);
|
||||
opal_event_set(&int_handler, SIGINT, OPAL_EV_SIGNAL,
|
||||
shutdown_signal, NULL);
|
||||
opal_event_add(&int_handler, NULL);
|
||||
|
||||
/** setup callbacks for signals we should ignore */
|
||||
opal_signal_set(&sigusr1_handler, SIGUSR1,
|
||||
signal_callback, &sigusr1_handler);
|
||||
opal_signal_add(&sigusr1_handler, NULL);
|
||||
opal_signal_set(&sigusr2_handler, SIGUSR2,
|
||||
signal_callback, &sigusr2_handler);
|
||||
opal_signal_add(&sigusr2_handler, NULL);
|
||||
#endif /* __WINDOWS__ */
|
||||
|
||||
signals_set = true;
|
||||
|
||||
/* setup stdout/stderr */
|
||||
if (orte_debug_daemons_file_flag) {
|
||||
/* if we are debugging to a file, then send stdout/stderr to
|
||||
* the orted log file
|
||||
*/
|
||||
|
||||
/* get my jobid */
|
||||
if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobidstring,
|
||||
ORTE_PROC_MY_NAME->jobid))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
goto DONE;
|
||||
}
|
||||
|
||||
/* define a log file name in the session directory */
|
||||
snprintf(log_file, PATH_MAX, "output-orted-%s-%s.log",
|
||||
jobidstring, orte_process_info.nodename);
|
||||
log_path = opal_os_path(false,
|
||||
orte_process_info.tmpdir_base,
|
||||
orte_process_info.top_session_dir,
|
||||
log_file,
|
||||
NULL);
|
||||
|
||||
fd = open(log_path, O_RDWR|O_CREAT|O_TRUNC, 0640);
|
||||
if (fd < 0) {
|
||||
/* couldn't open the file for some reason, so
|
||||
* just connect everything to /dev/null
|
||||
*/
|
||||
fd = open("/dev/null", O_RDWR|O_CREAT|O_TRUNC, 0666);
|
||||
} else {
|
||||
dup2(fd, STDOUT_FILENO);
|
||||
dup2(fd, STDERR_FILENO);
|
||||
if(fd != STDOUT_FILENO && fd != STDERR_FILENO) {
|
||||
close(fd);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* output a message indicating we are alive, our name, and our pid
|
||||
* for debugging purposes
|
||||
*/
|
||||
@ -800,41 +705,13 @@ int orte_daemon(int argc, char *argv[])
|
||||
|
||||
/* should never get here, but if we do... */
|
||||
DONE:
|
||||
if (signals_set) {
|
||||
/* Release all local signal handlers */
|
||||
opal_event_del(&term_handler);
|
||||
opal_event_del(&int_handler);
|
||||
#ifndef __WINDOWS__
|
||||
opal_signal_del(&sigusr1_handler);
|
||||
opal_signal_del(&sigusr2_handler);
|
||||
#endif /* __WINDOWS__ */
|
||||
}
|
||||
|
||||
/* cleanup any lingering session directories */
|
||||
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
||||
|
||||
/* cleanup the triggers */
|
||||
OBJ_DESTRUCT(&orte_exit);
|
||||
|
||||
/* Finalize and clean up ourselves */
|
||||
orte_finalize();
|
||||
orte_quit();
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void shutdown_signal(int fd, short flags, void *arg)
|
||||
{
|
||||
/* trigger the call to shutdown callback to protect
|
||||
* against race conditions - the trigger event will
|
||||
* check the one-time lock
|
||||
*/
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
orte_trigger_event(&orte_exit);
|
||||
}
|
||||
|
||||
static void shutdown_callback(int fd, short flags, void *arg)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if (NULL != arg) {
|
||||
/* it's the singleton pipe... remove that handler */
|
||||
opal_event_del(&pipe_handler);
|
||||
@ -844,27 +721,14 @@ static void shutdown_callback(int fd, short flags, void *arg)
|
||||
opal_output(0, "%s orted: finalizing", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
}
|
||||
|
||||
/* cleanup */
|
||||
if (NULL != log_path) {
|
||||
unlink(log_path);
|
||||
}
|
||||
|
||||
/* make sure our local procs are dead */
|
||||
orte_odls.kill_local_procs(NULL);
|
||||
|
||||
/* whack any lingering session directory files from our jobs */
|
||||
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
||||
|
||||
/* cleanup the triggers */
|
||||
OBJ_DESTRUCT(&orte_exit);
|
||||
|
||||
/* if we were ordered to abort, do so */
|
||||
if (orted_globals.abort) {
|
||||
opal_output(0, "%s is executing clean abort", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
/* do -not- call finalize as this will send a message to the HNP
|
||||
* indicating clean termination! Instead, just forcibly cleanup
|
||||
* the local session_dir tree and abort
|
||||
* indicating clean termination! Instead, just kill our
|
||||
* local procs, forcibly cleanup the local session_dir tree, and abort
|
||||
*/
|
||||
orte_odls.kill_local_procs(NULL);
|
||||
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
||||
abort();
|
||||
} else if ((int)ORTE_PROC_MY_NAME->vpid == orted_globals.fail) {
|
||||
@ -873,38 +737,10 @@ static void shutdown_callback(int fd, short flags, void *arg)
|
||||
* indicating clean termination! Instead, just forcibly cleanup
|
||||
* the local session_dir tree and exit
|
||||
*/
|
||||
orte_odls.kill_local_procs(NULL);
|
||||
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
||||
exit(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
}
|
||||
|
||||
if (signals_set) {
|
||||
/* Release all local signal handlers */
|
||||
opal_event_del(&epipe_handler);
|
||||
opal_event_del(&term_handler);
|
||||
opal_event_del(&int_handler);
|
||||
#ifndef __WINDOWS__
|
||||
opal_signal_del(&sigusr1_handler);
|
||||
opal_signal_del(&sigusr2_handler);
|
||||
#endif /* __WINDOWS__ */
|
||||
}
|
||||
|
||||
/* Finalize and clean up ourselves */
|
||||
ret = orte_finalize();
|
||||
exit(orte_exit_status);
|
||||
}
|
||||
|
||||
/**
|
||||
* Deal with sigpipe errors
|
||||
*/
|
||||
static void epipe_signal_callback(int fd, short flags, void *arg)
|
||||
{
|
||||
/* for now, we just announce and ignore them */
|
||||
opal_output(0, "%s reports a SIGPIPE error on fd %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), fd);
|
||||
return;
|
||||
}
|
||||
|
||||
static void signal_callback(int fd, short event, void *arg)
|
||||
{
|
||||
/* just ignore these signals */
|
||||
orte_quit();
|
||||
}
|
||||
|
@ -26,6 +26,7 @@ headers += \
|
||||
runtime/runtime.h \
|
||||
runtime/orte_locks.h \
|
||||
runtime/orte_globals.h \
|
||||
runtime/orte_quit.h \
|
||||
runtime/runtime_internals.h \
|
||||
runtime/data_type_support/orte_dt_support.h
|
||||
|
||||
@ -34,6 +35,7 @@ libopen_rte_la_SOURCES += \
|
||||
runtime/orte_init.c \
|
||||
runtime/orte_locks.c \
|
||||
runtime/orte_globals.c \
|
||||
runtime/orte_quit.c \
|
||||
runtime/data_type_support/orte_dt_compare_fns.c \
|
||||
runtime/data_type_support/orte_dt_copy_fns.c \
|
||||
runtime/data_type_support/orte_dt_print_fns.c \
|
||||
|
@ -753,6 +753,13 @@ int orte_dt_pack_app_context(opal_buffer_t *buffer, const void *src,
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the constrain flag */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
|
||||
(void*)(&(app_context[i]->constrain)), 1, OPAL_BOOL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -526,9 +526,10 @@ int orte_dt_print_app_context(char **output, char *prefix, orte_app_context_t *s
|
||||
asprintf(&pfx2, "%s", prefix);
|
||||
}
|
||||
|
||||
asprintf(&tmp, "\n%sData for app_context: name: %s\t index %lu\tapp: %s\n%s\tNum procs: %lu\tMax Local Restarts: %d\tMax Global Restarts %d",
|
||||
asprintf(&tmp, "\n%sData for app_context: name: %s\t index %lu\tapp: %s\n%s\tNum procs: %lu\tMax Local Restarts: %d\tMax Global Restarts %d\tConstrain: %s",
|
||||
pfx2, src->name, (unsigned long)src->idx, src->app,
|
||||
pfx2, (unsigned long)src->num_procs, src->max_local_restarts, src->max_global_restarts);
|
||||
pfx2, (unsigned long)src->num_procs, src->max_local_restarts, src->max_global_restarts,
|
||||
src->constrain ? "TRUE" : "FALSE");
|
||||
|
||||
count = opal_argv_count(src->argv);
|
||||
for (i=0; i < count; i++) {
|
||||
|
@ -831,6 +831,13 @@ int orte_dt_unpack_app_context(opal_buffer_t *buffer, void *dest,
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* unpack the constrain flag */
|
||||
max_n=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &app_context[i]->constrain,
|
||||
&max_n, OPAL_BOOL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
@ -60,6 +60,7 @@ bool orte_do_not_launch = false;
|
||||
bool orted_spin_flag = false;
|
||||
bool orte_daemon_bootstrap = false;
|
||||
char *orte_local_cpu_model = NULL;
|
||||
char *orte_basename = NULL;
|
||||
|
||||
/* ORTE OOB port flags */
|
||||
bool orte_static_ports = false;
|
||||
@ -89,8 +90,7 @@ bool orte_output_debugger_proctable=false;
|
||||
char *orte_debugger_test_daemon=NULL;
|
||||
bool orte_debugger_test_attach=false;
|
||||
|
||||
/* exit triggers and flags */
|
||||
orte_trigger_event_t orte_exit, orteds_exit;
|
||||
/* exit flags */
|
||||
int orte_exit_status = 0;
|
||||
bool orte_abnormal_term_ordered = false;
|
||||
bool orte_routing_is_enabled = false;
|
||||
@ -144,9 +144,6 @@ bool orte_forward_job_control;
|
||||
char *orte_rsh_agent = NULL;
|
||||
bool orte_assume_same_shell = true;
|
||||
|
||||
/* orted exit with barrier */
|
||||
bool orte_orted_exit_with_barrier = true;
|
||||
|
||||
/* report launch progress */
|
||||
bool orte_report_launch_progress = false;
|
||||
|
||||
@ -535,6 +532,7 @@ static void orte_app_context_construct(orte_app_context_t* app_context)
|
||||
app_context->used_on_node = false;
|
||||
app_context->max_local_restarts = -1;
|
||||
app_context->max_global_restarts = -1;
|
||||
app_context->constrain = true;
|
||||
}
|
||||
|
||||
static void orte_app_context_destructor(orte_app_context_t* app_context)
|
||||
|
@ -217,6 +217,11 @@ typedef struct {
|
||||
int32_t max_local_restarts;
|
||||
/* max number of times a process can be relocated to another node */
|
||||
int32_t max_global_restarts;
|
||||
/* whether or not the procs in this app are constrained to stay
|
||||
* on the specified nodes when restarted, or can move to any
|
||||
* known node
|
||||
*/
|
||||
bool constrain;
|
||||
} orte_app_context_t;
|
||||
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_app_context_t);
|
||||
@ -287,17 +292,18 @@ typedef struct {
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_node_t);
|
||||
|
||||
/* define a set of flags to control the launch of a job */
|
||||
typedef uint8_t orte_job_controls_t;
|
||||
#define ORTE_JOB_CONTROL OPAL_UINT8
|
||||
typedef uint16_t orte_job_controls_t;
|
||||
#define ORTE_JOB_CONTROL OPAL_UINT16
|
||||
|
||||
#define ORTE_JOB_CONTROL_LOCAL_SLAVE 0x01
|
||||
#define ORTE_JOB_CONTROL_NON_ORTE_JOB 0x02
|
||||
#define ORTE_JOB_CONTROL_DEBUGGER_DAEMON 0x14
|
||||
#define ORTE_JOB_CONTROL_FORWARD_OUTPUT 0x08
|
||||
#define ORTE_JOB_CONTROL_DO_NOT_MONITOR 0x10
|
||||
#define ORTE_JOB_CONTROL_FORWARD_COMM 0x20
|
||||
#define ORTE_JOB_CONTROL_CONTINUOUS_OP 0x40
|
||||
#define ORTE_JOB_CONTROL_RECOVERABLE 0x80
|
||||
#define ORTE_JOB_CONTROL_LOCAL_SLAVE 0x0001
|
||||
#define ORTE_JOB_CONTROL_NON_ORTE_JOB 0x0002
|
||||
#define ORTE_JOB_CONTROL_DEBUGGER_DAEMON 0x0014
|
||||
#define ORTE_JOB_CONTROL_FORWARD_OUTPUT 0x0008
|
||||
#define ORTE_JOB_CONTROL_DO_NOT_MONITOR 0x0010
|
||||
#define ORTE_JOB_CONTROL_FORWARD_COMM 0x0020
|
||||
#define ORTE_JOB_CONTROL_CONTINUOUS_OP 0x0040
|
||||
#define ORTE_JOB_CONTROL_RECOVERABLE 0x0080
|
||||
#define ORTE_JOB_CONTROL_SPIN_FOR_DEBUG 0x0100
|
||||
|
||||
#define ORTE_MAPPING_POLICY OPAL_UINT16
|
||||
/* put the rank assignment method in the upper 8 bits */
|
||||
@ -567,6 +573,7 @@ ORTE_DECLSPEC extern bool orte_do_not_launch;
|
||||
ORTE_DECLSPEC extern bool orted_spin_flag;
|
||||
ORTE_DECLSPEC extern bool orte_daemon_bootstrap;
|
||||
ORTE_DECLSPEC extern char *orte_local_cpu_model;
|
||||
ORTE_DECLSPEC extern char *orte_basename;
|
||||
|
||||
/* ORTE OOB port flags */
|
||||
ORTE_DECLSPEC extern bool orte_static_ports;
|
||||
@ -596,9 +603,7 @@ ORTE_DECLSPEC extern bool orte_output_debugger_proctable;
|
||||
ORTE_DECLSPEC extern char *orte_debugger_test_daemon;
|
||||
ORTE_DECLSPEC extern bool orte_debugger_test_attach;
|
||||
|
||||
/* exit triggers and flags */
|
||||
ORTE_DECLSPEC extern orte_trigger_event_t orte_exit;
|
||||
ORTE_DECLSPEC extern orte_trigger_event_t orteds_exit;
|
||||
/* exit flags */
|
||||
ORTE_DECLSPEC extern int orte_exit_status;
|
||||
ORTE_DECLSPEC extern bool orte_abnormal_term_ordered;
|
||||
ORTE_DECLSPEC extern bool orte_routing_is_enabled;
|
||||
@ -652,9 +657,6 @@ ORTE_DECLSPEC extern char *orte_xterm;
|
||||
ORTE_DECLSPEC extern char *orte_rsh_agent;
|
||||
ORTE_DECLSPEC extern bool orte_assume_same_shell;
|
||||
|
||||
/* whether or not to barrier the orteds upon exit */
|
||||
ORTE_DECLSPEC extern bool orte_orted_exit_with_barrier;
|
||||
|
||||
/* whether or not to report launch progress */
|
||||
ORTE_DECLSPEC extern bool orte_report_launch_progress;
|
||||
|
||||
|
@ -27,7 +27,8 @@ opal_atomic_lock_t orte_finalize_lock;
|
||||
|
||||
/* for HNPs */
|
||||
opal_atomic_lock_t orte_abort_inprogress_lock;
|
||||
|
||||
opal_atomic_lock_t orte_jobs_complete_lock;
|
||||
opal_atomic_lock_t orte_quit_lock;
|
||||
|
||||
int orte_locks_init(void)
|
||||
{
|
||||
@ -36,6 +37,8 @@ int orte_locks_init(void)
|
||||
|
||||
/* for HNPs */
|
||||
opal_atomic_init(&orte_abort_inprogress_lock, OPAL_ATOMIC_UNLOCKED);
|
||||
opal_atomic_init(&orte_jobs_complete_lock, OPAL_ATOMIC_UNLOCKED);
|
||||
opal_atomic_init(&orte_quit_lock, OPAL_ATOMIC_UNLOCKED);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -35,7 +35,8 @@ ORTE_DECLSPEC extern opal_atomic_lock_t orte_finalize_lock;
|
||||
|
||||
/* for HNPs */
|
||||
ORTE_DECLSPEC extern opal_atomic_lock_t orte_abort_inprogress_lock;
|
||||
|
||||
ORTE_DECLSPEC extern opal_atomic_lock_t orte_jobs_complete_lock;
|
||||
ORTE_DECLSPEC extern opal_atomic_lock_t orte_quit_lock;
|
||||
|
||||
/**
|
||||
* Initialize the locks
|
||||
|
401
orte/runtime/orte_quit.c
Обычный файл
401
orte/runtime/orte_quit.c
Обычный файл
@ -0,0 +1,401 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2008 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif
|
||||
#include <stdio.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_PARAM_H
|
||||
#include <sys/param.h>
|
||||
#endif
|
||||
#include <errno.h>
|
||||
#include <signal.h>
|
||||
#include <ctype.h>
|
||||
#ifdef HAVE_SYS_TYPES_H
|
||||
#include <sys/types.h>
|
||||
#endif /* HAVE_SYS_TYPES_H */
|
||||
#ifdef HAVE_SYS_WAIT_H
|
||||
#include <sys/wait.h>
|
||||
#endif /* HAVE_SYS_WAIT_H */
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif /* HAVE_SYS_TIME_H */
|
||||
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/debugger/debugger.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/util/show_help.h"
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
#include "orte/runtime/orte_locks.h"
|
||||
#include "orte/runtime/orte_data_server.h"
|
||||
|
||||
/*
|
||||
* Globals
|
||||
*/
|
||||
|
||||
static int num_aborted = 0;
|
||||
static int num_killed = 0;
|
||||
static int num_failed_start = 0;
|
||||
|
||||
static void dump_aborted_procs(void);
|
||||
|
||||
void orte_jobs_complete(void)
|
||||
{
|
||||
/* check one-time lock to protect against multiple calls */
|
||||
if (!opal_atomic_trylock(&orte_jobs_complete_lock)) { /* returns 1 if already locked */
|
||||
return;
|
||||
}
|
||||
|
||||
/* if we never launched, just skip this part to avoid
|
||||
* meaningless error messages
|
||||
*/
|
||||
if (orte_never_launched) {
|
||||
ORTE_UPDATE_EXIT_STATUS(orte_exit_status);
|
||||
orte_quit();
|
||||
}
|
||||
|
||||
if (0 != orte_exit_status && !orte_execute_quiet) {
|
||||
/* abnormal termination of some kind */
|
||||
dump_aborted_procs();
|
||||
/* If we showed more abort messages than were allowed,
|
||||
show a followup message here */
|
||||
if (num_failed_start > 1) {
|
||||
if (orte_xml_output) {
|
||||
fprintf(orte_xml_fp, "<stderr>");
|
||||
}
|
||||
fprintf(orte_xml_fp, "%d total process%s failed to start",
|
||||
num_failed_start, ((num_failed_start > 1) ? "es" : ""));
|
||||
if (orte_xml_output) {
|
||||
fprintf(orte_xml_fp, "
</stderr>");
|
||||
}
|
||||
fprintf(orte_xml_fp, "\n");
|
||||
}
|
||||
if (num_aborted > 1) {
|
||||
if (orte_xml_output) {
|
||||
fprintf(orte_xml_fp, "<stderr>");
|
||||
}
|
||||
fprintf(orte_xml_fp, "%d total process%s aborted",
|
||||
num_aborted, ((num_aborted > 1) ? "es" : ""));
|
||||
if (orte_xml_output) {
|
||||
fprintf(orte_xml_fp, "
</stderr>");
|
||||
}
|
||||
fprintf(orte_xml_fp, "\n");
|
||||
}
|
||||
if (num_killed > 1) {
|
||||
if (orte_xml_output) {
|
||||
fprintf(orte_xml_fp, "<stderr>");
|
||||
}
|
||||
fprintf(orte_xml_fp, "%d total process%s killed (some possibly by %s during cleanup)",
|
||||
num_killed, ((num_killed > 1) ? "es" : ""), orte_basename);
|
||||
if (orte_xml_output) {
|
||||
fprintf(orte_xml_fp, "
</stderr>");
|
||||
}
|
||||
fprintf(orte_xml_fp, "\n");
|
||||
}
|
||||
}
|
||||
|
||||
/* if the debuggers were run, clean up */
|
||||
orte_debugger.finalize();
|
||||
|
||||
if (0 < orte_routed.num_routes()) {
|
||||
orte_plm.terminate_orteds();
|
||||
}
|
||||
}
|
||||
|
||||
void orte_quit(void)
|
||||
{
|
||||
/* check one-time lock to protect against "bounce" */
|
||||
if (!opal_atomic_trylock(&orte_quit_lock)) { /* returns 1 if already locked */
|
||||
return;
|
||||
}
|
||||
|
||||
/* whack any lingering session directory files from our jobs */
|
||||
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
||||
|
||||
/* cleanup our data server */
|
||||
orte_data_server_finalize();
|
||||
|
||||
/* cleanup and leave */
|
||||
orte_finalize();
|
||||
|
||||
if (NULL != orte_basename) {
|
||||
free(orte_basename);
|
||||
}
|
||||
|
||||
if (orte_debug_flag) {
|
||||
fprintf(stderr, "orterun: exiting with status %d\n", orte_exit_status);
|
||||
}
|
||||
exit(orte_exit_status);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* On abnormal termination - dump the
|
||||
* exit status of the aborted procs.
|
||||
*/
|
||||
|
||||
static void dump_aborted_procs(void)
|
||||
{
|
||||
orte_std_cntr_t i, n;
|
||||
orte_proc_t *proc, *pptr;
|
||||
orte_app_context_t *app, *approc;
|
||||
orte_job_t *job;
|
||||
orte_node_t *node;
|
||||
|
||||
/* find the job that caused the problem - be sure to start the loop
|
||||
* at 1 as the daemons are in 0 and will clearly be "running", so no
|
||||
* point in checking them
|
||||
*/
|
||||
for (n=1; n < orte_job_data->size; n++) {
|
||||
if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, n))) {
|
||||
/* the array is no longer left-justified, so we have to continue */
|
||||
continue;
|
||||
}
|
||||
if (ORTE_JOB_STATE_UNDEF != job->state &&
|
||||
ORTE_JOB_STATE_INIT != job->state &&
|
||||
ORTE_JOB_STATE_LAUNCHED != job->state &&
|
||||
ORTE_JOB_STATE_RUNNING != job->state &&
|
||||
ORTE_JOB_STATE_TERMINATED != job->state &&
|
||||
ORTE_JOB_STATE_ABORT_ORDERED != job->state) {
|
||||
/* this is a guilty party */
|
||||
proc = job->aborted_proc;
|
||||
/* always must be at least one app */
|
||||
app = (orte_app_context_t*)opal_pointer_array_get_item(job->apps, 0);
|
||||
/* cycle through and count the number that were killed or aborted */
|
||||
for (i=0; i < job->procs->size; i++) {
|
||||
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(job->procs, i))) {
|
||||
/* empty slot - the array may not be left-justified, so keep scanning */
|
||||
continue;
|
||||
}
|
||||
if (ORTE_PROC_STATE_FAILED_TO_START == pptr->state) {
|
||||
++num_failed_start;
|
||||
} else if (ORTE_PROC_STATE_ABORTED == pptr->state) {
|
||||
++num_aborted;
|
||||
} else if (ORTE_PROC_STATE_ABORTED_BY_SIG == pptr->state) {
|
||||
++num_killed;
|
||||
}
|
||||
}
|
||||
approc = (orte_app_context_t*)opal_pointer_array_get_item(job->apps, proc->app_idx);
|
||||
node = proc->node;
|
||||
if (ORTE_JOB_STATE_FAILED_TO_START == job->state) {
|
||||
if (NULL == proc) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status-no-node", true,
|
||||
orte_basename);
|
||||
return;
|
||||
}
|
||||
switch (OPAL_SOS_GET_ERROR_CODE(proc->exit_code)) {
|
||||
case ORTE_ERR_SYS_LIMITS_PIPES:
|
||||
orte_show_help("help-orterun.txt", "orterun:sys-limit-pipe", true,
|
||||
orte_basename, proc->node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
break;
|
||||
case ORTE_ERR_PIPE_SETUP_FAILURE:
|
||||
orte_show_help("help-orterun.txt", "orterun:pipe-setup-failure", true,
|
||||
orte_basename, proc->node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
break;
|
||||
case ORTE_ERR_SYS_LIMITS_CHILDREN:
|
||||
orte_show_help("help-orterun.txt", "orterun:sys-limit-children", true,
|
||||
orte_basename, proc->node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
break;
|
||||
case ORTE_ERR_FAILED_GET_TERM_ATTRS:
|
||||
orte_show_help("help-orterun.txt", "orterun:failed-term-attrs", true,
|
||||
orte_basename, proc->node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
break;
|
||||
case ORTE_ERR_WDIR_NOT_FOUND:
|
||||
orte_show_help("help-orterun.txt", "orterun:wdir-not-found", true,
|
||||
orte_basename, approc->cwd,
|
||||
proc->node->name, (unsigned long)proc->name.vpid);
|
||||
break;
|
||||
case ORTE_ERR_EXE_NOT_FOUND:
|
||||
orte_show_help("help-orterun.txt", "orterun:exe-not-found", true,
|
||||
orte_basename,
|
||||
(unsigned long)proc->name.vpid,
|
||||
orte_basename,
|
||||
orte_basename,
|
||||
proc->node->name,
|
||||
approc->app);
|
||||
break;
|
||||
case ORTE_ERR_EXE_NOT_ACCESSIBLE:
|
||||
orte_show_help("help-orterun.txt", "orterun:exe-not-accessible", true,
|
||||
orte_basename, approc->app, proc->node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
break;
|
||||
case ORTE_ERR_MULTIPLE_AFFINITIES:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:multiple-paffinity-schemes", true, proc->slot_list);
|
||||
break;
|
||||
case ORTE_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:topo-not-supported",
|
||||
true, orte_process_info.nodename, "rankfile containing a slot_list of ",
|
||||
proc->slot_list, approc->app);
|
||||
break;
|
||||
case ORTE_ERR_INVALID_NODE_RANK:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:invalid-node-rank", true);
|
||||
break;
|
||||
case ORTE_ERR_INVALID_LOCAL_RANK:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:invalid-local-rank", true);
|
||||
break;
|
||||
case ORTE_ERR_NOT_ENOUGH_CORES:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:not-enough-resources", true,
|
||||
"sockets", node->name,
|
||||
"bind-to-core", approc->app);
|
||||
break;
|
||||
case ORTE_ERR_TOPO_CORE_NOT_SUPPORTED:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:topo-not-supported",
|
||||
true, node->name, "bind-to-core", "",
|
||||
approc->app);
|
||||
break;
|
||||
case ORTE_ERR_INVALID_PHYS_CPU:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:invalid-phys-cpu", true);
|
||||
break;
|
||||
case ORTE_ERR_NOT_ENOUGH_SOCKETS:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:not-enough-resources", true,
|
||||
"sockets", node->name,
|
||||
"bind-to-socket", approc->app);
|
||||
break;
|
||||
case ORTE_ERR_TOPO_SOCKET_NOT_SUPPORTED:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:topo-not-supported",
|
||||
true, node->name, "bind-to-socket", "",
|
||||
approc->app);
|
||||
break;
|
||||
case ORTE_ERR_MODULE_NOT_FOUND:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:paffinity-missing-module",
|
||||
true, node->name);
|
||||
break;
|
||||
case ORTE_ERR_SLOT_LIST_RANGE:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:invalid-slot-list-range",
|
||||
true, node->name, proc->slot_list);
|
||||
break;
|
||||
case ORTE_ERR_PIPE_READ_FAILURE:
|
||||
orte_show_help("help-orterun.txt", "orterun:pipe-read-failure", true,
|
||||
orte_basename, node->name, (unsigned long)proc->name.vpid);
|
||||
break;
|
||||
case ORTE_ERR_SOCKET_NOT_AVAILABLE:
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-socket-not-avail", true,
|
||||
orte_basename, ORTE_ERROR_NAME(proc->exit_code), node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
break;
|
||||
|
||||
default:
|
||||
if (0 != proc->exit_code) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start", true,
|
||||
orte_basename, ORTE_ERROR_NAME(proc->exit_code), node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
} else {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status", true,
|
||||
orte_basename, node->name);
|
||||
}
|
||||
break;
|
||||
}
|
||||
} else if (ORTE_JOB_STATE_ABORTED == job->state) {
|
||||
if (NULL == proc) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-aborted-unknown", true,
|
||||
orte_basename);
|
||||
} else {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-ordered-abort", true,
|
||||
orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
|
||||
node->name, orte_basename);
|
||||
}
|
||||
} else if (ORTE_JOB_STATE_ABORTED_BY_SIG == job->state) { /* aborted by signal */
|
||||
if (NULL == proc) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-aborted-signal-unknown", true,
|
||||
orte_basename);
|
||||
} else {
|
||||
#ifdef HAVE_STRSIGNAL
|
||||
if (NULL != strsignal(WTERMSIG(proc->exit_code))) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-aborted-strsignal", true,
|
||||
orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
|
||||
node->name, WTERMSIG(proc->exit_code),
|
||||
strsignal(WTERMSIG(proc->exit_code)));
|
||||
} else {
|
||||
#endif
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-aborted", true,
|
||||
orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
|
||||
node->name, WTERMSIG(proc->exit_code));
|
||||
#ifdef HAVE_STRSIGNAL
|
||||
}
|
||||
#endif
|
||||
}
|
||||
} else if (ORTE_JOB_STATE_ABORTED_WO_SYNC == job->state) { /* proc exited w/o finalize */
|
||||
if (NULL == proc) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-exit-no-sync-unknown", true,
|
||||
orte_basename, orte_basename);
|
||||
} else {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-exit-no-sync", true,
|
||||
orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
|
||||
node->name, orte_basename, orte_basename);
|
||||
}
|
||||
} else if (ORTE_JOB_STATE_COMM_FAILED == job->state) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-comm-failed", true,
|
||||
ORTE_NAME_PRINT(&proc->name), node->name);
|
||||
} else if (ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED == job->state) {
|
||||
switch (proc->exit_code) {
|
||||
case ORTE_ERR_MEM_LIMIT_EXCEEDED:
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-mem-exceeded", true,
|
||||
ORTE_NAME_PRINT(&proc->name), node->name);
|
||||
break;
|
||||
case ORTE_ERR_PROC_STALLED:
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-stalled", true);
|
||||
break;
|
||||
|
||||
default:
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-sensor-exceeded", true);
|
||||
break;
|
||||
}
|
||||
} else if (ORTE_JOB_STATE_CALLED_ABORT == job->state) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-called-abort", true,
|
||||
orte_basename,
|
||||
(0 == strncmp("orte", orte_basename, 4)) ? "orte" : "MPI");
|
||||
} else if (ORTE_JOB_STATE_HEARTBEAT_FAILED == job->state) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-heartbeat-failed", true,
|
||||
orte_basename, ORTE_NAME_PRINT(&proc->name), node->name);
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
29
orte/runtime/orte_quit.h
Обычный файл
29
orte/runtime/orte_quit.h
Обычный файл
@ -0,0 +1,29 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* Shared job-completion and termination routines (orte_jobs_complete, orte_quit).
|
||||
*/
|
||||
|
||||
#ifndef ORTE_QUIT_H
|
||||
#define ORTE_QUIT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
ORTE_DECLSPEC void orte_jobs_complete(void);
|
||||
|
||||
ORTE_DECLSPEC void orte_quit(void);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* ORTE_QUIT_H */
|
@ -56,6 +56,7 @@
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/mca/rml/base/rml_contact.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
|
||||
/*
|
||||
* Local variables & functions
|
||||
@ -68,7 +69,6 @@ static bool all_recvd;
|
||||
static int32_t num_replies;
|
||||
static int32_t num_recvd;
|
||||
static opal_buffer_t cmdbuf;
|
||||
static opal_event_t *my_exit_event;
|
||||
static FILE *fp = NULL;
|
||||
static bool help;
|
||||
static char *hnppidstr;
|
||||
@ -181,7 +181,7 @@ static void send_cmd(int fd, short dummy, void *arg)
|
||||
num_recvd = 0;
|
||||
if (0 > (ret = orte_rml.send_buffer(&(target_hnp->name), &cmdbuf, ORTE_RML_TAG_DAEMON, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
orte_trigger_event(&orteds_exit);
|
||||
orte_quit();
|
||||
return;
|
||||
}
|
||||
|
||||
@ -197,7 +197,7 @@ static void send_cmd(int fd, short dummy, void *arg)
|
||||
if (0 < update_rate) {
|
||||
ORTE_TIMER_EVENT(update_rate, 0, send_cmd);
|
||||
} else {
|
||||
orte_trigger_event(&orte_exit);
|
||||
orte_quit();
|
||||
}
|
||||
}
|
||||
|
||||
@ -263,14 +263,7 @@ main(int argc, char *argv[])
|
||||
return 1;
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&orte_exit, orte_trigger_event_t);
|
||||
|
||||
if (ORTE_SUCCESS != orte_wait_event(&my_exit_event, &orte_exit, "job_complete", abort_exit_callback)) {
|
||||
orte_finalize();
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* setup the list for recvd stats */
|
||||
/* setup the list for recvd stats */
|
||||
OBJ_CONSTRUCT(&recvd_stats, opal_list_t);
|
||||
|
||||
/** setup callbacks for abort signals - from this point
|
||||
@ -567,8 +560,8 @@ static void abort_exit_callback(int fd, short ign, void *arg)
|
||||
if (NULL != fp && fp != stdout) {
|
||||
fclose(fp);
|
||||
}
|
||||
orte_finalize();
|
||||
exit(1);
|
||||
ORTE_UPDATE_EXIT_STATUS(1);
|
||||
orte_quit();
|
||||
}
|
||||
|
||||
static void process_stats(int fd, short event, void *data)
|
||||
|
@ -92,6 +92,7 @@
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/orte_data_server.h"
|
||||
#include "orte/runtime/orte_locks.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
|
||||
/* ensure I can behave like a daemon */
|
||||
#include "orte/orted/orted.h"
|
||||
@ -101,31 +102,13 @@
|
||||
/*
|
||||
* Globals
|
||||
*/
|
||||
static struct opal_event term_handler;
|
||||
static struct opal_event int_handler;
|
||||
static struct opal_event epipe_handler;
|
||||
#ifndef __WINDOWS__
|
||||
static struct opal_event sigusr1_handler;
|
||||
static struct opal_event sigusr2_handler;
|
||||
static struct opal_event sigtstp_handler;
|
||||
static struct opal_event sigcont_handler;
|
||||
#endif /* __WINDOWS__ */
|
||||
static orte_job_t *jdata=NULL;
|
||||
static char *orterun_basename = NULL;
|
||||
static int num_aborted = 0;
|
||||
static int num_killed = 0;
|
||||
static int num_failed_start = 0;
|
||||
static char **global_mca_env = NULL;
|
||||
static bool have_zero_np = false;
|
||||
static orte_std_cntr_t total_num_apps = 0;
|
||||
static bool want_prefix_by_default = (bool) ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT;
|
||||
static opal_event_t *orterun_event=NULL, *orteds_exit_event=NULL;
|
||||
static char *ompi_server=NULL;
|
||||
static opal_event_t *abort_exit_event=NULL;
|
||||
static bool forcibly_die = false;
|
||||
static opal_event_t *timeout_ev=NULL;
|
||||
static bool profile_is_set = false;
|
||||
static bool signals_set=false;
|
||||
|
||||
/*
|
||||
* Globals
|
||||
@ -145,7 +128,7 @@ static opal_cmd_line_init_t cmd_line_init[] = {
|
||||
&orterun_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Be verbose" },
|
||||
{ "orte", "execute", "quiet", 'q', NULL, "quiet", 0,
|
||||
&orterun_globals.quiet, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Suppress helpful messages" },
|
||||
{ NULL, NULL, NULL, '\0', "report-pid", "report-pid", 1,
|
||||
&orterun_globals.report_pid, OPAL_CMD_LINE_TYPE_STRING,
|
||||
@ -455,20 +438,12 @@ static opal_cmd_line_init_t cmd_line_init[] = {
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static void job_completed(int trigpipe, short event, void *arg);
|
||||
static void abort_signal_callback(int fd, short flags, void *arg);
|
||||
static void abort_exit_callback(int fd, short event, void *arg);
|
||||
static void epipe_signal_callback(int fd, short flags, void *arg);
|
||||
static void signal_forward_callback(int fd, short event, void *arg);
|
||||
static int create_app(int argc, char* argv[], orte_app_context_t **app,
|
||||
bool *made_app, char ***app_env);
|
||||
static int init_globals(void);
|
||||
static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line);
|
||||
static int parse_locals(int argc, char* argv[]);
|
||||
static int parse_appfile(char *filename, char ***env);
|
||||
static void dump_aborted_procs(void);
|
||||
static void just_quit(int fd, short ign, void *arg);
|
||||
|
||||
static void run_debugger(char *basename, opal_cmd_line_t *cmd_line,
|
||||
int argc, char *argv[], int num_procs);
|
||||
|
||||
@ -480,7 +455,7 @@ int orterun(int argc, char *argv[])
|
||||
|
||||
/* find our basename (the name of the executable) so that we can
|
||||
use it in pretty-print error messages */
|
||||
orterun_basename = opal_basename(argv[0]);
|
||||
orte_basename = opal_basename(argv[0]);
|
||||
|
||||
/* Setup and parse the command line */
|
||||
init_globals();
|
||||
@ -525,10 +500,6 @@ int orterun(int argc, char *argv[])
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* setup the exit triggers */
|
||||
OBJ_CONSTRUCT(&orte_exit, orte_trigger_event_t);
|
||||
OBJ_CONSTRUCT(&orteds_exit, orte_trigger_event_t);
|
||||
|
||||
/* flag that I am the HNP - needs to be done prior to
|
||||
* registering params
|
||||
*/
|
||||
@ -582,7 +553,7 @@ int orterun(int argc, char *argv[])
|
||||
/* This should never happen -- this case should be caught in
|
||||
create_app(), but let's just double check... */
|
||||
orte_show_help("help-orterun.txt", "orterun:nothing-to-do",
|
||||
true, orterun_basename);
|
||||
true, orte_basename);
|
||||
exit(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
}
|
||||
|
||||
@ -634,7 +605,7 @@ int orterun(int argc, char *argv[])
|
||||
fp = fopen(orterun_globals.report_uri, "w");
|
||||
if (NULL == fp) {
|
||||
orte_show_help("help-orterun.txt", "orterun:write_file", false,
|
||||
orterun_basename, "uri", orterun_globals.report_uri);
|
||||
orte_basename, "uri", orterun_globals.report_uri);
|
||||
exit(0);
|
||||
}
|
||||
fprintf(fp, "%s\n", (NULL == rml_uri) ? "NULL" : rml_uri);
|
||||
@ -655,68 +626,6 @@ int orterun(int argc, char *argv[])
|
||||
but what the heck... :-) */
|
||||
opal_progress_set_event_flag(OPAL_EVLOOP_ONCE);
|
||||
|
||||
/* setup an event we can wait for that will tell
|
||||
* us to terminate - both normal and abnormal
|
||||
* termination will call us here. Use the
|
||||
* same exit fd as the daemon does so that orted_comm
|
||||
* can cause either of us to exit since we share that code
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_wait_event(&orterun_event, &orte_exit, "job_complete", job_completed))) {
|
||||
orte_show_help("help-orterun.txt", "orterun:event-def-failed", true,
|
||||
orterun_basename, ORTE_ERROR_NAME(rc));
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
goto DONE;
|
||||
}
|
||||
|
||||
/* setup an event that will
|
||||
* trigger when the orteds are gone and tell the orteds that it is
|
||||
* okay to finalize and exit, we are done with them.
|
||||
* We set this up here in order to provide a way for us to
|
||||
* wakeup and terminate should the daemons themselves fail to launch,
|
||||
* and before we define signal handlers since they will call the
|
||||
* exit event trigger!
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_wait_event(&orteds_exit_event, &orteds_exit, "orted_exit", just_quit))) {
|
||||
orte_show_help("help-orterun.txt", "orterun:event-def-failed", true,
|
||||
orterun_basename, ORTE_ERROR_NAME(rc));
|
||||
goto DONE;
|
||||
}
|
||||
|
||||
#ifndef __WINDOWS__
|
||||
/* setup callback for SIGPIPE */
|
||||
opal_signal_set(&epipe_handler, SIGPIPE,
|
||||
epipe_signal_callback, &epipe_handler);
|
||||
opal_signal_add(&epipe_handler, NULL);
|
||||
/** setup callbacks for abort signals - from this point
|
||||
* forward, we need to abort in a manner that allows us
|
||||
* to cleanup
|
||||
*/
|
||||
opal_signal_set(&term_handler, SIGTERM,
|
||||
abort_signal_callback, &term_handler);
|
||||
opal_signal_add(&term_handler, NULL);
|
||||
opal_signal_set(&int_handler, SIGINT,
|
||||
abort_signal_callback, &int_handler);
|
||||
opal_signal_add(&int_handler, NULL);
|
||||
|
||||
/** setup callbacks for signals we should forward */
|
||||
opal_signal_set(&sigusr1_handler, SIGUSR1,
|
||||
signal_forward_callback, &sigusr1_handler);
|
||||
opal_signal_add(&sigusr1_handler, NULL);
|
||||
opal_signal_set(&sigusr2_handler, SIGUSR2,
|
||||
signal_forward_callback, &sigusr2_handler);
|
||||
opal_signal_add(&sigusr2_handler, NULL);
|
||||
if (orte_forward_job_control) {
|
||||
opal_signal_set(&sigtstp_handler, SIGTSTP,
|
||||
signal_forward_callback, &sigtstp_handler);
|
||||
opal_signal_add(&sigtstp_handler, NULL);
|
||||
opal_signal_set(&sigcont_handler, SIGCONT,
|
||||
signal_forward_callback, &sigcont_handler);
|
||||
opal_signal_add(&sigcont_handler, NULL);
|
||||
}
|
||||
#endif /* __WINDOWS__ */
|
||||
|
||||
signals_set = true;
|
||||
|
||||
/* If we have a prefix, then modify the PATH and
|
||||
LD_LIBRARY_PATH environment variables in our copy. This
|
||||
will ensure that any locally-spawned children will
|
||||
@ -743,7 +652,7 @@ int orterun(int argc, char *argv[])
|
||||
}
|
||||
opal_setenv("PATH", newenv, true, &orte_launch_environ);
|
||||
if (orte_debug_flag) {
|
||||
opal_output(0, "%s: reset PATH: %s", orterun_basename, newenv);
|
||||
opal_output(0, "%s: reset PATH: %s", orte_basename, newenv);
|
||||
}
|
||||
free(newenv);
|
||||
free(bin_base);
|
||||
@ -760,7 +669,7 @@ int orterun(int argc, char *argv[])
|
||||
opal_setenv("LD_LIBRARY_PATH", newenv, true, &orte_launch_environ);
|
||||
if (orte_debug_flag) {
|
||||
opal_output(0, "%s: reset LD_LIBRARY_PATH: %s",
|
||||
orterun_basename, newenv);
|
||||
orte_basename, newenv);
|
||||
}
|
||||
free(newenv);
|
||||
free(lib_base);
|
||||
@ -770,7 +679,7 @@ int orterun(int argc, char *argv[])
|
||||
if (ORTE_SUCCESS != (rc = orte_pre_condition_transports(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
orte_show_help("help-orterun.txt", "orterun:precondition", false,
|
||||
orterun_basename, NULL, NULL, rc);
|
||||
orte_basename, NULL, NULL, rc);
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
goto DONE;
|
||||
}
|
||||
@ -821,7 +730,7 @@ int orterun(int argc, char *argv[])
|
||||
if (ORTE_SUCCESS != (rc = orte_rml.ping(ompi_server, &timeout))) {
|
||||
/* okay give up */
|
||||
orte_show_help("help-orterun.txt", "orterun:server-not-found", true,
|
||||
orterun_basename, ompi_server,
|
||||
orte_basename, ompi_server,
|
||||
(long)orterun_globals.server_wait_timeout,
|
||||
ORTE_ERROR_NAME(rc));
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
@ -848,524 +757,10 @@ int orterun(int argc, char *argv[])
|
||||
*/
|
||||
DONE:
|
||||
ORTE_UPDATE_EXIT_STATUS(orte_exit_status);
|
||||
just_quit(0,0,NULL);
|
||||
orte_quit();
|
||||
return orte_exit_status;
|
||||
}
|
||||
|
||||
static void job_completed(int trigpipe, short event, void *arg)
|
||||
{
|
||||
int rc;
|
||||
orte_job_t *daemons;
|
||||
|
||||
/* if the abort exit event is set, delete it */
|
||||
if (NULL != abort_exit_event) {
|
||||
opal_evtimer_del(abort_exit_event);
|
||||
free(abort_exit_event);
|
||||
}
|
||||
|
||||
/* if we never launched, just skip this part to avoid
|
||||
* meaningless error messages
|
||||
*/
|
||||
if (orte_never_launched) {
|
||||
rc = orte_exit_status;
|
||||
goto DONE;
|
||||
}
|
||||
|
||||
if (0 != orte_exit_status && !orterun_globals.quiet) {
|
||||
/* abnormal termination of some kind */
|
||||
dump_aborted_procs();
|
||||
/* If we showed more abort messages than were allowed,
|
||||
show a followup message here */
|
||||
if (num_failed_start > 1) {
|
||||
if (orte_xml_output) {
|
||||
fprintf(orte_xml_fp, "<stderr>");
|
||||
}
|
||||
fprintf(orte_xml_fp, "%d total process%s failed to start",
|
||||
num_failed_start, ((num_failed_start > 1) ? "es" : ""));
|
||||
if (orte_xml_output) {
|
||||
fprintf(orte_xml_fp, "
</stderr>");
|
||||
}
|
||||
fprintf(orte_xml_fp, "\n");
|
||||
}
|
||||
if (num_aborted > 1) {
|
||||
if (orte_xml_output) {
|
||||
fprintf(orte_xml_fp, "<stderr>");
|
||||
}
|
||||
fprintf(orte_xml_fp, "%d total process%s aborted",
|
||||
num_aborted, ((num_aborted > 1) ? "es" : ""));
|
||||
if (orte_xml_output) {
|
||||
fprintf(orte_xml_fp, "
</stderr>");
|
||||
}
|
||||
fprintf(orte_xml_fp, "\n");
|
||||
}
|
||||
if (num_killed > 1) {
|
||||
if (orte_xml_output) {
|
||||
fprintf(orte_xml_fp, "<stderr>");
|
||||
}
|
||||
fprintf(orte_xml_fp, "%d total process%s killed (some possibly by %s during cleanup)",
|
||||
num_killed, ((num_killed > 1) ? "es" : ""), orterun_basename);
|
||||
if (orte_xml_output) {
|
||||
fprintf(orte_xml_fp, "
</stderr>");
|
||||
}
|
||||
fprintf(orte_xml_fp, "\n");
|
||||
}
|
||||
}
|
||||
|
||||
/* if the debuggers were run, clean up */
|
||||
orte_debugger.finalize();
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_plm.terminate_orteds())) {
|
||||
/* since we know that the sends didn't completely go out,
|
||||
* we know that the barrier will never complete. Add a timeout so
|
||||
* that those daemons that can respond have a chance to do
|
||||
* so
|
||||
*/
|
||||
/* get the orted job data object */
|
||||
if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
|
||||
/* we are totally hozed */
|
||||
goto DONE;
|
||||
}
|
||||
ORTE_DETECT_TIMEOUT(&timeout_ev, daemons->num_procs,
|
||||
orte_timeout_usec_per_proc,
|
||||
orte_max_timeout, just_quit);
|
||||
}
|
||||
|
||||
/* ensure all the orteds depart together */
|
||||
orte_grpcomm.onesided_barrier();
|
||||
|
||||
DONE:
|
||||
ORTE_UPDATE_EXIT_STATUS(rc);
|
||||
just_quit(0, 0, NULL);
|
||||
}
|
||||
|
||||
static void just_quit(int fd, short ign, void *arg)
|
||||
{
|
||||
/* if the orted exit event is set, delete it */
|
||||
if (NULL != orteds_exit_event) {
|
||||
opal_evtimer_del(orteds_exit_event);
|
||||
free(orteds_exit_event);
|
||||
}
|
||||
|
||||
if (signals_set) {
|
||||
/* Remove the epipe handler */
|
||||
opal_signal_del(&epipe_handler);
|
||||
/* Remove the TERM and INT signal handlers */
|
||||
opal_signal_del(&term_handler);
|
||||
opal_signal_del(&int_handler);
|
||||
#ifndef __WINDOWS__
|
||||
/** Remove the USR signal handlers */
|
||||
opal_signal_del(&sigusr1_handler);
|
||||
opal_signal_del(&sigusr2_handler);
|
||||
if (orte_forward_job_control) {
|
||||
opal_signal_del(&sigtstp_handler);
|
||||
opal_signal_del(&sigcont_handler);
|
||||
}
|
||||
#endif /* __WINDOWS__ */
|
||||
signals_set = false;
|
||||
}
|
||||
|
||||
/* whack any lingering session directory files from our jobs */
|
||||
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
||||
|
||||
/* cleanup our data server */
|
||||
orte_data_server_finalize();
|
||||
|
||||
/* cleanup and leave */
|
||||
orte_finalize();
|
||||
|
||||
free(orterun_basename);
|
||||
if (orte_debug_flag) {
|
||||
fprintf(stderr, "orterun: exiting with status %d\n", orte_exit_status);
|
||||
}
|
||||
exit(orte_exit_status);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* On abnormal termination - dump the
|
||||
* exit status of the aborted procs.
|
||||
*/
|
||||
|
||||
static void dump_aborted_procs(void)
|
||||
{
|
||||
orte_std_cntr_t i, n;
|
||||
orte_proc_t *proc, *pptr;
|
||||
orte_app_context_t *app, *approc;
|
||||
orte_job_t *job;
|
||||
orte_node_t *node;
|
||||
|
||||
/* find the job that caused the problem - be sure to start the loop
|
||||
* at 1 as the daemons are in 0 and will clearly be "running", so no
|
||||
* point in checking them
|
||||
*/
|
||||
for (n=1; n < orte_job_data->size; n++) {
|
||||
if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, n))) {
|
||||
/* the array is no longer left-justified, so we have to continue */
|
||||
continue;
|
||||
}
|
||||
if (ORTE_JOB_STATE_UNDEF != job->state &&
|
||||
ORTE_JOB_STATE_INIT != job->state &&
|
||||
ORTE_JOB_STATE_LAUNCHED != job->state &&
|
||||
ORTE_JOB_STATE_RUNNING != job->state &&
|
||||
ORTE_JOB_STATE_TERMINATED != job->state &&
|
||||
ORTE_JOB_STATE_ABORT_ORDERED != job->state) {
|
||||
/* this is a guilty party */
|
||||
proc = job->aborted_proc;
|
||||
/* always must be at least one app */
|
||||
app = (orte_app_context_t*)opal_pointer_array_get_item(job->apps, 0);
|
||||
/* cycle through and count the number that were killed or aborted */
|
||||
for (i=0; i < job->procs->size; i++) {
|
||||
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(job->procs, i))) {
|
||||
/* array is not necessarily left-justified - skip this empty slot */
|
||||
continue;
|
||||
}
|
||||
if (ORTE_PROC_STATE_FAILED_TO_START == pptr->state) {
|
||||
++num_failed_start;
|
||||
} else if (ORTE_PROC_STATE_ABORTED == pptr->state) {
|
||||
++num_aborted;
|
||||
} else if (ORTE_PROC_STATE_ABORTED_BY_SIG == pptr->state) {
|
||||
++num_killed;
|
||||
}
|
||||
}
|
||||
approc = (orte_app_context_t*)opal_pointer_array_get_item(job->apps, proc->app_idx);
|
||||
node = proc->node;
|
||||
if (ORTE_JOB_STATE_FAILED_TO_START == job->state) {
|
||||
if (NULL == proc) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status-no-node", true,
|
||||
orterun_basename);
|
||||
return;
|
||||
}
|
||||
switch (OPAL_SOS_GET_ERROR_CODE(proc->exit_code)) {
|
||||
case ORTE_ERR_SYS_LIMITS_PIPES:
|
||||
orte_show_help("help-orterun.txt", "orterun:sys-limit-pipe", true,
|
||||
orterun_basename, proc->node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
break;
|
||||
case ORTE_ERR_PIPE_SETUP_FAILURE:
|
||||
orte_show_help("help-orterun.txt", "orterun:pipe-setup-failure", true,
|
||||
orterun_basename, proc->node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
break;
|
||||
case ORTE_ERR_SYS_LIMITS_CHILDREN:
|
||||
orte_show_help("help-orterun.txt", "orterun:sys-limit-children", true,
|
||||
orterun_basename, proc->node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
break;
|
||||
case ORTE_ERR_FAILED_GET_TERM_ATTRS:
|
||||
orte_show_help("help-orterun.txt", "orterun:failed-term-attrs", true,
|
||||
orterun_basename, proc->node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
break;
|
||||
case ORTE_ERR_WDIR_NOT_FOUND:
|
||||
orte_show_help("help-orterun.txt", "orterun:wdir-not-found", true,
|
||||
orterun_basename, approc->cwd,
|
||||
proc->node->name, (unsigned long)proc->name.vpid);
|
||||
break;
|
||||
case ORTE_ERR_EXE_NOT_FOUND:
|
||||
orte_show_help("help-orterun.txt", "orterun:exe-not-found", true,
|
||||
orterun_basename,
|
||||
(unsigned long)proc->name.vpid,
|
||||
orterun_basename,
|
||||
orterun_basename,
|
||||
proc->node->name,
|
||||
approc->app);
|
||||
break;
|
||||
case ORTE_ERR_EXE_NOT_ACCESSIBLE:
|
||||
orte_show_help("help-orterun.txt", "orterun:exe-not-accessible", true,
|
||||
orterun_basename, approc->app, proc->node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
break;
|
||||
case ORTE_ERR_MULTIPLE_AFFINITIES:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:multiple-paffinity-schemes", true, proc->slot_list);
|
||||
break;
|
||||
case ORTE_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:topo-not-supported",
|
||||
true, orte_process_info.nodename, "rankfile containing a slot_list of ",
|
||||
proc->slot_list, approc->app);
|
||||
break;
|
||||
case ORTE_ERR_INVALID_NODE_RANK:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:invalid-node-rank", true);
|
||||
break;
|
||||
case ORTE_ERR_INVALID_LOCAL_RANK:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:invalid-local-rank", true);
|
||||
break;
|
||||
case ORTE_ERR_NOT_ENOUGH_CORES:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:not-enough-resources", true,
|
||||
"sockets", node->name,
|
||||
"bind-to-core", approc->app);
|
||||
break;
|
||||
case ORTE_ERR_TOPO_CORE_NOT_SUPPORTED:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:topo-not-supported",
|
||||
true, node->name, "bind-to-core", "",
|
||||
approc->app);
|
||||
break;
|
||||
case ORTE_ERR_INVALID_PHYS_CPU:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:invalid-phys-cpu", true);
|
||||
break;
|
||||
case ORTE_ERR_NOT_ENOUGH_SOCKETS:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:not-enough-resources", true,
|
||||
"sockets", node->name,
|
||||
"bind-to-socket", approc->app);
|
||||
break;
|
||||
case ORTE_ERR_TOPO_SOCKET_NOT_SUPPORTED:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:topo-not-supported",
|
||||
true, node->name, "bind-to-socket", "",
|
||||
approc->app);
|
||||
break;
|
||||
case ORTE_ERR_MODULE_NOT_FOUND:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:paffinity-missing-module",
|
||||
true, node->name);
|
||||
break;
|
||||
case ORTE_ERR_SLOT_LIST_RANGE:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:invalid-slot-list-range",
|
||||
true, node->name, proc->slot_list);
|
||||
break;
|
||||
case ORTE_ERR_PIPE_READ_FAILURE:
|
||||
orte_show_help("help-orterun.txt", "orterun:pipe-read-failure", true,
|
||||
orterun_basename, node->name, (unsigned long)proc->name.vpid);
|
||||
break;
|
||||
case ORTE_ERR_SOCKET_NOT_AVAILABLE:
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-socket-not-avail", true,
|
||||
orterun_basename, ORTE_ERROR_NAME(proc->exit_code), node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
break;
|
||||
|
||||
default:
|
||||
if (0 != proc->exit_code) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start", true,
|
||||
orterun_basename, ORTE_ERROR_NAME(proc->exit_code), node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
} else {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status", true,
|
||||
orterun_basename, node->name);
|
||||
}
|
||||
break;
|
||||
}
|
||||
} else if (ORTE_JOB_STATE_ABORTED == job->state) {
|
||||
if (NULL == proc) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-aborted-unknown", true,
|
||||
orterun_basename);
|
||||
} else {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-ordered-abort", true,
|
||||
orterun_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
|
||||
node->name, orterun_basename);
|
||||
}
|
||||
} else if (ORTE_JOB_STATE_ABORTED_BY_SIG == job->state) { /* aborted by signal */
|
||||
if (NULL == proc) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-aborted-signal-unknown", true,
|
||||
orterun_basename);
|
||||
} else {
|
||||
#ifdef HAVE_STRSIGNAL
|
||||
if (NULL != strsignal(WTERMSIG(proc->exit_code))) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-aborted-strsignal", true,
|
||||
orterun_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
|
||||
node->name, WTERMSIG(proc->exit_code),
|
||||
strsignal(WTERMSIG(proc->exit_code)));
|
||||
} else {
|
||||
#endif
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-aborted", true,
|
||||
orterun_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
|
||||
node->name, WTERMSIG(proc->exit_code));
|
||||
#ifdef HAVE_STRSIGNAL
|
||||
}
|
||||
#endif
|
||||
}
|
||||
} else if (ORTE_JOB_STATE_ABORTED_WO_SYNC == job->state) { /* proc exited w/o finalize */
|
||||
if (NULL == proc) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-exit-no-sync-unknown", true,
|
||||
orterun_basename, orterun_basename);
|
||||
} else {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-exit-no-sync", true,
|
||||
orterun_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
|
||||
node->name, orterun_basename, orterun_basename);
|
||||
}
|
||||
} else if (ORTE_JOB_STATE_COMM_FAILED == job->state) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-comm-failed", true,
|
||||
ORTE_NAME_PRINT(&proc->name), node->name);
|
||||
} else if (ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED == job->state) {
|
||||
switch (proc->exit_code) {
|
||||
case ORTE_ERR_MEM_LIMIT_EXCEEDED:
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-mem-exceeded", true,
|
||||
ORTE_NAME_PRINT(&proc->name), node->name);
|
||||
break;
|
||||
case ORTE_ERR_PROC_STALLED:
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-stalled", true);
|
||||
break;
|
||||
|
||||
default:
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-sensor-exceeded", true);
|
||||
break;
|
||||
}
|
||||
} else if (ORTE_JOB_STATE_CALLED_ABORT == job->state) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-called-abort", true,
|
||||
orterun_basename,
|
||||
(0 == strncmp("orte", orterun_basename, 4)) ? "orte" : "MPI");
|
||||
} else if (ORTE_JOB_STATE_HEARTBEAT_FAILED == job->state) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-heartbeat-failed", true,
|
||||
orterun_basename, ORTE_NAME_PRINT(&proc->name), node->name);
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void abort_exit_callback(int fd, short ign, void *arg)
|
||||
{
|
||||
int ret;
|
||||
|
||||
fprintf(stderr, "%s: killing job...\n\n", orterun_basename);
|
||||
|
||||
/* since we are being terminated by a user's signal, be
|
||||
* sure to exit with a non-zero exit code - but don't
|
||||
* overwrite any error code from a proc that might have
|
||||
* failed, in case that is why the user ordered us
|
||||
* to terminate
|
||||
*/
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
|
||||
/* terminate the job - this will also wakeup orterun so
|
||||
* it can report to the user and kill all the orteds.
|
||||
* Check the jobid, though, just in case the user
|
||||
* hit ctrl-c before we had a chance to setup the
|
||||
* job in the system - in which case there is nothing
|
||||
* to terminate!
|
||||
*/
|
||||
if (NULL != jdata &&
|
||||
jdata->jobid != ORTE_JOBID_INVALID &&
|
||||
!orte_never_launched) {
|
||||
/* if the debuggers were run, clean up */
|
||||
orte_debugger.finalize();
|
||||
|
||||
/*
|
||||
* Turn off the process recovery functionality, if it was enabled.
|
||||
* This keeps the errmgr from trying to recover from the shutdown
|
||||
* procedure.
|
||||
*/
|
||||
orte_enable_recovery = false;
|
||||
|
||||
/* terminate the orteds - they will automatically kill
|
||||
* their local procs
|
||||
*/
|
||||
ret = orte_plm.terminate_orteds();
|
||||
if (ORTE_SUCCESS != ret) {
|
||||
/* If we failed the terminate_orteds() above, then we
|
||||
* need to just die
|
||||
*/
|
||||
just_quit(fd, ign, arg);
|
||||
}
|
||||
/* give ourselves a time limit on how long to wait
|
||||
* for the job to die, just in case we can't make it go
|
||||
* away for some reason. Don't send us directly back
|
||||
* to job_completed, though, as that function may be
|
||||
* what has failed
|
||||
*/
|
||||
ORTE_DETECT_TIMEOUT(&abort_exit_event, jdata->num_procs,
|
||||
orte_timeout_usec_per_proc,
|
||||
orte_max_timeout,
|
||||
just_quit);
|
||||
|
||||
} else {
|
||||
/* if the jobid is invalid or we never launched,
|
||||
* there is nothing to do but just clean ourselves
|
||||
* up and exit
|
||||
*/
|
||||
just_quit(fd, ign, arg);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Attempt to terminate the job and wait for callback indicating
|
||||
* the job has been aborted.
|
||||
*/
|
||||
static void abort_signal_callback(int fd, short flags, void *arg)
|
||||
{
|
||||
/* if we have already ordered this once, don't keep
|
||||
* doing it to avoid race conditions
|
||||
*/
|
||||
if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */
|
||||
if (forcibly_die) {
|
||||
/* kill any local procs */
|
||||
orte_odls.kill_local_procs(NULL);
|
||||
|
||||
/* whack any lingering session directory files from our jobs */
|
||||
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
||||
|
||||
/* cleanup our data server */
|
||||
orte_data_server_finalize();
|
||||
|
||||
/* exit with a non-zero status */
|
||||
exit(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
}
|
||||
fprintf(stderr, "%s: abort is already in progress...hit ctrl-c again to forcibly terminate\n\n", orterun_basename);
|
||||
forcibly_die = true;
|
||||
return;
|
||||
}
|
||||
|
||||
/* set the global abnormal exit flag so we know not to
|
||||
* use the standard xcast for terminating orteds
|
||||
*/
|
||||
orte_abnormal_term_ordered = true;
|
||||
/* ensure that the forwarding of stdin stops */
|
||||
orte_job_term_ordered = true;
|
||||
|
||||
/* tell us to be quiet - hey, the user killed us with a ctrl-c,
|
||||
* so no need to tell them that!
|
||||
*/
|
||||
orterun_globals.quiet = true;
|
||||
|
||||
/* We are in an event handler; the job completed procedure
|
||||
will delete the signal handler that is currently running
|
||||
(which is a Bad Thing), so we can't call it directly.
|
||||
Instead, we have to exit this handler and setup to call
|
||||
job_completed() after this. */
|
||||
ORTE_TIMER_EVENT(0, 0, abort_exit_callback);
|
||||
}
|
||||
|
||||
/**
|
||||
* Deal with sigpipe errors
|
||||
*/
|
||||
static void epipe_signal_callback(int fd, short flags, void *arg)
|
||||
{
|
||||
/* for now, we just announce and ignore them */
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_debug_verbosity,
|
||||
"%s reports a SIGPIPE error on fd %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), fd));
|
||||
return;
|
||||
}
|
||||
|
||||
/**
|
||||
* Pass user signals to the remote application processes
|
||||
*/
|
||||
static void signal_forward_callback(int fd, short event, void *arg)
|
||||
{
|
||||
struct opal_event *signal = (struct opal_event*)arg;
|
||||
int signum, ret;
|
||||
|
||||
signum = OPAL_EVENT_SIGNAL(signal);
|
||||
if (!orterun_globals.quiet){
|
||||
fprintf(stderr, "%s: Forwarding signal %d to job\n",
|
||||
orterun_basename, signum);
|
||||
}
|
||||
|
||||
/** send the signal out to the processes, including any descendants */
|
||||
if (ORTE_SUCCESS != (ret = orte_plm.signal_job(jdata->jobid, signum))) {
|
||||
fprintf(stderr, "Signal %d could not be sent to the job (returned %d)",
|
||||
signum, ret);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static int init_globals(void)
|
||||
{
|
||||
/* Only CONSTRUCT things once */
|
||||
@ -1389,7 +784,6 @@ static int init_globals(void)
|
||||
orterun_globals.help = false;
|
||||
orterun_globals.version = false;
|
||||
orterun_globals.verbose = false;
|
||||
orterun_globals.quiet = false;
|
||||
orterun_globals.by_node = false;
|
||||
orterun_globals.by_slot = false;
|
||||
orterun_globals.by_board = false;
|
||||
@ -1429,13 +823,13 @@ static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line)
|
||||
if (orterun_globals.version &&
|
||||
!(1 == argc || orterun_globals.help)) {
|
||||
char *project_name = NULL;
|
||||
if (0 == strcmp(orterun_basename, "mpirun")) {
|
||||
if (0 == strcmp(orte_basename, "mpirun")) {
|
||||
project_name = "Open MPI";
|
||||
} else {
|
||||
project_name = "OpenRTE";
|
||||
}
|
||||
orte_show_help("help-orterun.txt", "orterun:version", false,
|
||||
orterun_basename, project_name, OPAL_VERSION,
|
||||
orte_basename, project_name, OPAL_VERSION,
|
||||
PACKAGE_BUGREPORT);
|
||||
/* if we were the only argument, exit */
|
||||
if (2 == argc) exit(0);
|
||||
@ -1445,15 +839,15 @@ static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line)
|
||||
if (1 == argc || orterun_globals.help) {
|
||||
char *args = NULL;
|
||||
char *project_name = NULL;
|
||||
if (0 == strcmp(orterun_basename, "mpirun")) {
|
||||
if (0 == strcmp(orte_basename, "mpirun")) {
|
||||
project_name = "Open MPI";
|
||||
} else {
|
||||
project_name = "OpenRTE";
|
||||
}
|
||||
args = opal_cmd_line_get_usage_msg(cmd_line);
|
||||
orte_show_help("help-orterun.txt", "orterun:usage", false,
|
||||
orterun_basename, project_name, OPAL_VERSION,
|
||||
orterun_basename, args,
|
||||
orte_basename, project_name, OPAL_VERSION,
|
||||
orte_basename, args,
|
||||
PACKAGE_BUGREPORT);
|
||||
free(args);
|
||||
|
||||
@ -1474,7 +868,7 @@ static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line)
|
||||
fp = fopen(orterun_globals.report_pid, "w");
|
||||
if (NULL == fp) {
|
||||
orte_show_help("help-orterun.txt", "orterun:write_file", false,
|
||||
orterun_basename, "pid", orterun_globals.report_pid);
|
||||
orte_basename, "pid", orterun_globals.report_pid);
|
||||
exit(0);
|
||||
}
|
||||
fprintf(fp, "%d\n", (int)getpid());
|
||||
@ -1485,7 +879,7 @@ static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line)
|
||||
/* Do we want a user-level debugger? */
|
||||
|
||||
if (orterun_globals.debugger) {
|
||||
run_debugger(orterun_basename, cmd_line, argc, argv, orterun_globals.num_procs);
|
||||
run_debugger(orte_basename, cmd_line, argc, argv, orterun_globals.num_procs);
|
||||
}
|
||||
|
||||
/* extract any rank assignment policy directives */
|
||||
@ -1555,7 +949,7 @@ static int parse_locals(int argc, char* argv[])
|
||||
if (NULL == filename) {
|
||||
/* filename is not correctly formatted */
|
||||
orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-bad", true,
|
||||
orterun_basename, orterun_globals.ompi_server);
|
||||
orte_basename, orterun_globals.ompi_server);
|
||||
exit(1);
|
||||
}
|
||||
++filename; /* space past the : */
|
||||
@ -1563,7 +957,7 @@ static int parse_locals(int argc, char* argv[])
|
||||
if (0 >= strlen(filename)) {
|
||||
/* they forgot to give us the name! */
|
||||
orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-missing", true,
|
||||
orterun_basename, orterun_globals.ompi_server);
|
||||
orte_basename, orterun_globals.ompi_server);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
@ -1571,15 +965,15 @@ static int parse_locals(int argc, char* argv[])
|
||||
fp = fopen(filename, "r");
|
||||
if (NULL == fp) { /* can't find or read file! */
|
||||
orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-access", true,
|
||||
orterun_basename, orterun_globals.ompi_server);
|
||||
orte_basename, orterun_globals.ompi_server);
|
||||
exit(1);
|
||||
}
|
||||
if (NULL == fgets(input, 1024, fp)) {
|
||||
/* something malformed about file */
|
||||
fclose(fp);
|
||||
orte_show_help("help-orterun.txt", "orterun:ompi-server-file-bad", true,
|
||||
orterun_basename, orterun_globals.ompi_server,
|
||||
orterun_basename);
|
||||
orte_basename, orterun_globals.ompi_server,
|
||||
orte_basename);
|
||||
exit(1);
|
||||
}
|
||||
fclose(fp);
|
||||
@ -1597,8 +991,8 @@ static int parse_locals(int argc, char* argv[])
|
||||
if (NULL == ptr) {
|
||||
/* pid is not correctly formatted */
|
||||
orte_show_help("help-orterun.txt", "orterun:ompi-server-pid-bad", true,
|
||||
orterun_basename, orterun_basename,
|
||||
orterun_globals.ompi_server, orterun_basename);
|
||||
orte_basename, orte_basename,
|
||||
orterun_globals.ompi_server, orte_basename);
|
||||
exit(1);
|
||||
}
|
||||
++ptr; /* space past the : */
|
||||
@ -1606,8 +1000,8 @@ static int parse_locals(int argc, char* argv[])
|
||||
if (0 >= strlen(ptr)) {
|
||||
/* they forgot to give us the pid! */
|
||||
orte_show_help("help-orterun.txt", "orterun:ompi-server-pid-bad", true,
|
||||
orterun_basename, orterun_basename,
|
||||
orterun_globals.ompi_server, orterun_basename);
|
||||
orte_basename, orte_basename,
|
||||
orterun_globals.ompi_server, orte_basename);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
@ -1621,7 +1015,7 @@ static int parse_locals(int argc, char* argv[])
|
||||
&orte_process_info.top_session_dir,
|
||||
NULL, NULL, NULL))) {
|
||||
orte_show_help("help-orterun.txt", "orterun:ompi-server-could-not-get-hnp-list", true,
|
||||
orterun_basename, orterun_basename);
|
||||
orte_basename, orte_basename);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
@ -1630,7 +1024,7 @@ static int parse_locals(int argc, char* argv[])
|
||||
/* get the list of HNPs, but do -not- setup contact info to them in the RML */
|
||||
if (ORTE_SUCCESS != (rc = orte_list_local_hnps(&hnp_list, false))) {
|
||||
orte_show_help("help-orterun.txt", "orterun:ompi-server-could-not-get-hnp-list", true,
|
||||
orterun_basename, orterun_basename);
|
||||
orte_basename, orte_basename);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
@ -1645,8 +1039,8 @@ static int parse_locals(int argc, char* argv[])
|
||||
}
|
||||
/* if we got here, it wasn't found */
|
||||
orte_show_help("help-orterun.txt", "orterun:ompi-server-pid-not-found", true,
|
||||
orterun_basename, orterun_basename, pid, orterun_globals.ompi_server,
|
||||
orterun_basename);
|
||||
orte_basename, orte_basename, pid, orterun_globals.ompi_server,
|
||||
orte_basename);
|
||||
OBJ_DESTRUCT(&hnp_list);
|
||||
exit(1);
|
||||
hnp_found:
|
||||
@ -1843,7 +1237,7 @@ static int capture_cmd_line_params(int argc, int start, char **argv)
|
||||
* and abort as we cannot know which one is correct
|
||||
*/
|
||||
orte_show_help("help-orterun.txt", "orterun:conflicting-params",
|
||||
true, orterun_basename, argv[i+1],
|
||||
true, orte_basename, argv[i+1],
|
||||
argv[i+2], orted_cmd_line[j+1]);
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
@ -1945,7 +1339,7 @@ static int create_app(int argc, char* argv[], orte_app_context_t **app_ptr,
|
||||
|
||||
if (0 == count) {
|
||||
orte_show_help("help-orterun.txt", "orterun:executable-not-specified",
|
||||
true, orterun_basename, orterun_basename);
|
||||
true, orte_basename, orte_basename);
|
||||
rc = ORTE_ERR_NOT_FOUND;
|
||||
goto cleanup;
|
||||
}
|
||||
@ -2102,7 +1496,7 @@ static int create_app(int argc, char* argv[], orte_app_context_t **app_ptr,
|
||||
param_len--;
|
||||
if (0 == param_len) {
|
||||
orte_show_help("help-orterun.txt", "orterun:empty-prefix",
|
||||
true, orterun_basename, orterun_basename);
|
||||
true, orte_basename, orte_basename);
|
||||
return ORTE_ERR_FATAL;
|
||||
}
|
||||
}
|
||||
@ -2118,7 +1512,7 @@ static int create_app(int argc, char* argv[], orte_app_context_t **app_ptr,
|
||||
if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "hostfile"))) {
|
||||
if(1 < j) {
|
||||
orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles",
|
||||
true, orterun_basename, NULL);
|
||||
true, orte_basename, NULL);
|
||||
return ORTE_ERR_FATAL;
|
||||
} else {
|
||||
value = opal_cmd_line_get_param(&cmd_line, "hostfile", 0, 0);
|
||||
@ -2128,7 +1522,7 @@ static int create_app(int argc, char* argv[], orte_app_context_t **app_ptr,
|
||||
if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "machinefile"))) {
|
||||
if(1 < j || NULL != app->hostfile) {
|
||||
orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles",
|
||||
true, orterun_basename, NULL);
|
||||
true, orte_basename, NULL);
|
||||
return ORTE_ERR_FATAL;
|
||||
} else {
|
||||
value = opal_cmd_line_get_param(&cmd_line, "machinefile", 0, 0);
|
||||
@ -2169,7 +1563,7 @@ static int create_app(int argc, char* argv[], orte_app_context_t **app_ptr,
|
||||
* then give us another application.
|
||||
*/
|
||||
orte_show_help("help-orterun.txt", "orterun:multi-apps-and-zero-np",
|
||||
true, orterun_basename, NULL);
|
||||
true, orte_basename, NULL);
|
||||
return ORTE_ERR_FATAL;
|
||||
}
|
||||
|
||||
@ -2195,7 +1589,7 @@ static int create_app(int argc, char* argv[], orte_app_context_t **app_ptr,
|
||||
app->app = strdup(app->argv[0]);
|
||||
if (NULL == app->app) {
|
||||
orte_show_help("help-orterun.txt", "orterun:call-failed",
|
||||
true, orterun_basename, "library", "strdup returned NULL", errno);
|
||||
true, orte_basename, "library", "strdup returned NULL", errno);
|
||||
rc = ORTE_ERR_NOT_FOUND;
|
||||
goto cleanup;
|
||||
}
|
||||
|
@ -37,7 +37,6 @@ struct orterun_globals_t {
|
||||
bool help;
|
||||
bool version;
|
||||
bool verbose;
|
||||
bool quiet;
|
||||
char *report_pid;
|
||||
char *report_uri;
|
||||
bool exit;
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user