diff --git a/orte/mca/plm/alps/plm_alps_module.c b/orte/mca/plm/alps/plm_alps_module.c index 789357dc81..660d002c13 100644 --- a/orte/mca/plm/alps/plm_alps_module.c +++ b/orte/mca/plm/alps/plm_alps_module.c @@ -149,7 +149,8 @@ static int plm_alps_launch_job(orte_job_t *jdata) orte_node_t **nodes; orte_std_cntr_t nnode; orte_jobid_t failed_job; - + orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED; + /* default to declaring the daemon launch failed */ failed_job = ORTE_PROC_MY_NAME->jobid; @@ -341,6 +342,9 @@ static int plm_alps_launch_job(orte_job_t *jdata) } } + /* set the job state to indicate we attempted to launch */ + job_state = ORTE_JOB_STATE_FAILED_TO_START; + /* exec the daemon(s) */ if (ORTE_SUCCESS != (rc = plm_alps_start_proc(argc, argv, env, cur_prefix))) { ORTE_ERROR_LOG(rc); @@ -403,7 +407,7 @@ cleanup: /* check for failed launch - if so, force terminate */ if (failed_launch) { - orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START); + orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state); } return rc; diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index 198aa4bcaa..8b25144900 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -244,6 +244,14 @@ void orte_plm_base_launch_failed(orte_jobid_t job, pid_t pid, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job))); + /* if we didn't even attempt to launch, then just quietly update + * the job record and leave + */ + if (ORTE_JOB_NEVER_LAUNCHED == state) { + orte_never_launched = true; + goto PROCESS; + } + /* if this is the daemon job that failed, set the flag indicating * that a daemon failed so we use the proper * methods for attempting to shutdown the rest of the system @@ -282,7 +290,7 @@ void orte_plm_base_launch_failed(orte_jobid_t job, pid_t pid, free(pidstr); } - +PROCESS: /* Set the job state as indicated so orterun's exit status will be non-zero */ diff --git a/orte/mca/plm/ccp/plm_ccp_module.c b/orte/mca/plm/ccp/plm_ccp_module.c index 674a24a4b5..3454e24974 100644 --- a/orte/mca/plm/ccp/plm_ccp_module.c +++ b/orte/mca/plm/ccp/plm_ccp_module.c @@ -152,6 +152,7 @@ static int plm_ccp_launch_job(orte_job_t *jdata) JobPriority job_priority = JobPriority_Normal; orte_jobid_t failed_job; + orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED; /* default to declaring the daemon launch failed */ failed_job = ORTE_PROC_MY_NAME->jobid; @@ -364,7 +365,10 @@ GETMAP: hr = pJob->SetExtendedJobTerm(_bstr_t(L"extended terms"), _bstr_t(L"TermValue")); - /* Iterate through each of the nodes and spin + /* set the job state to indicate we attempted to launch */ + job_state = ORTE_JOB_STATE_FAILED_TO_START; + + /* Iterate through each of the nodes and spin * up a daemon. */ for (i = 0; i < map->num_nodes; i++) { @@ -566,7 +570,7 @@ launch_apps: /* check for failed launch - if so, force terminate */ if (failed_launch) { - orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START); + orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state); } /* check for timing request - get stop time and process if so */ diff --git a/orte/mca/plm/lsf/plm_lsf_module.c b/orte/mca/plm/lsf/plm_lsf_module.c index 17feb3e482..50522a3d23 100644 --- a/orte/mca/plm/lsf/plm_lsf_module.c +++ b/orte/mca/plm/lsf/plm_lsf_module.c @@ -143,7 +143,8 @@ static int plm_lsf_launch_job(orte_job_t *jdata) orte_node_t **nodes; orte_std_cntr_t nnode; orte_jobid_t failed_job; - + orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED; + /* default to declaring the daemons failed*/ failed_job = ORTE_PROC_MY_NAME->jobid; @@ -290,6 +291,9 @@ static int plm_lsf_launch_job(orte_job_t *jdata) } } + /* set the job state to indicate we attempted to launch */ + job_state = ORTE_JOB_STATE_FAILED_TO_START; + /* lsb_launch tampers with SIGCHLD. * After the call to lsb_launch, the signal handler for SIGCHLD is NULL. * So, we disable the SIGCHLD handler of libevent for the duration of @@ -364,7 +368,7 @@ cleanup: /* check for failed launch - if so, force terminate */ if (failed_launch) { - orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START); + orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state); } return rc; diff --git a/orte/mca/plm/plm_types.h b/orte/mca/plm/plm_types.h index 5b8929e560..8ff919391e 100644 --- a/orte/mca/plm/plm_types.h +++ b/orte/mca/plm/plm_types.h @@ -85,8 +85,14 @@ typedef uint16_t orte_job_state_t; #define ORTE_JOB_STATE_ABORTED_BY_SIG 0x0400 /* job was killed by a signal */ #define ORTE_JOB_STATE_ABORTED_WO_SYNC 0x0800 /* job was aborted because proc exit'd w/o required sync */ +/* the job never even attempted to launch due to an error earlier in the + * launch procedure + */ +#define ORTE_JOB_NEVER_LAUNCHED 0x1000 + /* the processes in this job have been ordered to "die", but may not have completed it yet. Don't order it again */ -#define ORTE_JOB_STATE_ABORT_ORDERED 0x8000 +#define ORTE_JOB_STATE_ABORT_ORDERED 0x8000 + /** * Node State, corresponding to the ORTE_NODE_STATE_* #defines, diff --git a/orte/mca/plm/process/plm_process_module.c b/orte/mca/plm/process/plm_process_module.c index 5ac128d028..aa7307c64f 100644 --- a/orte/mca/plm/process/plm_process_module.c +++ b/orte/mca/plm/process/plm_process_module.c @@ -461,7 +461,8 @@ int orte_plm_process_launch(orte_job_t *jdata) orte_app_context_t **apps; orte_node_t **nodes; orte_std_cntr_t nnode; - + orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED; + if (mca_plm_process_component.timing) { if (0 != gettimeofday(&joblaunchstart, NULL)) { opal_output(0, "plm_process: could not obtain start time"); @@ -607,6 +608,9 @@ int orte_plm_process_launch(orte_job_t *jdata) lib_base = opal_basename(opal_install_dirs.libdir); bin_base = opal_basename(opal_install_dirs.bindir); + /* set the job state to indicate we attempted to launch */ + job_state = ORTE_JOB_STATE_FAILED_TO_START; + /* * Iterate through each of the nodes */ @@ -860,7 +864,7 @@ launch_apps: /* check for failed launch - if so, force terminate */ if( failed_launch ) { - orte_plm_base_launch_failed(jdata->jobid, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START); + orte_plm_base_launch_failed(jdata->jobid, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state); } return rc; diff --git a/orte/mca/plm/rsh/plm_rsh_module.c b/orte/mca/plm/rsh/plm_rsh_module.c index 3fe24881d7..927b7861ba 100644 --- a/orte/mca/plm/rsh/plm_rsh_module.c +++ b/orte/mca/plm/rsh/plm_rsh_module.c @@ -946,6 +946,7 @@ int orte_plm_rsh_launch(orte_job_t *jdata) orte_node_t **nodes; orte_std_cntr_t nnode; orte_jobid_t failed_job; + orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED; /* default to declaring the daemon launch as having failed */ failed_job = ORTE_PROC_MY_NAME->jobid; @@ -1088,6 +1089,9 @@ int orte_plm_rsh_launch(orte_job_t *jdata) find_children(0, 0, 0, jdatorted->num_procs); } + /* set the job state to indicate we attempted to launch */ + job_state = ORTE_JOB_STATE_FAILED_TO_START; + /* * Iterate through each of the nodes */ @@ -1237,7 +1241,7 @@ launch_apps: /* check for failed launch - if so, force terminate */ if (failed_launch) { - orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START); + orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state); } /* setup a "heartbeat" timer to periodically check on diff --git a/orte/mca/plm/slurm/plm_slurm_module.c b/orte/mca/plm/slurm/plm_slurm_module.c index d258e55627..7fc06a2246 100644 --- a/orte/mca/plm/slurm/plm_slurm_module.c +++ b/orte/mca/plm/slurm/plm_slurm_module.c @@ -147,7 +147,8 @@ static int plm_slurm_launch_job(orte_job_t *jdata) struct timeval launchstart, launchstop; int proc_vpid_index; orte_jobid_t failed_job; - + orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED; + /* flag the daemons as failing by default */ failed_job = ORTE_PROC_MY_NAME->jobid; @@ -334,6 +335,9 @@ static int plm_slurm_launch_job(orte_job_t *jdata) } } + /* set the job state to indicate we attempted to launch */ + job_state = ORTE_JOB_STATE_FAILED_TO_START; + /* setup environment */ env = opal_argv_copy(orte_launch_environ); @@ -406,7 +410,7 @@ cleanup: /* check for failed launch - if so, force terminate */ if (failed_launch) { - orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START); + orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state); } return rc; diff --git a/orte/mca/plm/tm/plm_tm_module.c b/orte/mca/plm/tm/plm_tm_module.c index 6c09839d66..e510d6ed16 100644 --- a/orte/mca/plm/tm/plm_tm_module.c +++ b/orte/mca/plm/tm/plm_tm_module.c @@ -148,7 +148,8 @@ static int plm_tm_launch_job(orte_job_t *jdata) bool failed_launch = true; mode_t current_umask; orte_jobid_t failed_job; - + orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED; + /* default to declaring the daemons as failed */ failed_job = ORTE_PROC_MY_NAME->jobid; @@ -273,6 +274,9 @@ static int plm_tm_launch_job(orte_job_t *jdata) } } + /* set the job state to indicate we attempted to launch */ + job_state = ORTE_JOB_STATE_FAILED_TO_START; + /* Iterate through each of the nodes and spin * up a daemon. */ @@ -413,7 +417,7 @@ launch_apps: /* check for failed launch - if so, force terminate */ if (failed_launch) { - orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START); + orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state); } /* setup a "heartbeat" timer to periodically check on diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index f91836f12c..b5070e1605 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -58,6 +58,7 @@ int orted_debug_failure; int orted_debug_failure_delay; bool orte_homogeneous_nodes = false; bool orte_hetero_apps = false; +bool orte_never_launched = false; int32_t orte_contiguous_nodes; int orte_debug_output = -1; diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index 8a0a9f3f34..8d80abbee5 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -380,6 +380,7 @@ ORTE_DECLSPEC extern int orted_debug_failure; ORTE_DECLSPEC extern int orted_debug_failure_delay; ORTE_DECLSPEC extern bool orte_homogeneous_nodes; ORTE_DECLSPEC extern bool orte_hetero_apps; +ORTE_DECLSPEC extern bool orte_never_launched; ORTE_DECLSPEC extern char **orte_launch_environ; ORTE_DECLSPEC extern opal_pointer_array_t orte_daemonmap; diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index 08da031455..3067dbf928 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -643,6 +643,14 @@ static void job_completed(int trigpipe, short event, void *arg) exit_state = jdata->state; + /* if we never launched, just skip this part to avoid + * meaningless error messages + */ + if (orte_never_launched) { + rc = orte_exit_status; + goto DONE; + } + if (ORTE_JOB_STATE_TERMINATED != exit_state) { /* abnormal termination of some kind */ dump_aborted_procs();