When we hit an error prior to actually launching daemons, it would be nice if orterun didn't bark about daemons failing to launch, mpirun detecting that a job failed, etc.
Add a new job state to indicate that we never attempted to launch. Flag such a scenario and avoid hitting all the other error messages. This commit was SVN r19366.
Этот коммит содержится в:
родитель
9447334749
Коммит
4e0f34a062
@ -149,7 +149,8 @@ static int plm_alps_launch_job(orte_job_t *jdata)
|
||||
orte_node_t **nodes;
|
||||
orte_std_cntr_t nnode;
|
||||
orte_jobid_t failed_job;
|
||||
|
||||
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
|
||||
|
||||
/* default to declaring the daemon launch failed */
|
||||
failed_job = ORTE_PROC_MY_NAME->jobid;
|
||||
|
||||
@ -341,6 +342,9 @@ static int plm_alps_launch_job(orte_job_t *jdata)
|
||||
}
|
||||
}
|
||||
|
||||
/* set the job state to indicate we attempted to launch */
|
||||
job_state = ORTE_JOB_STATE_FAILED_TO_START;
|
||||
|
||||
/* exec the daemon(s) */
|
||||
if (ORTE_SUCCESS != (rc = plm_alps_start_proc(argc, argv, env, cur_prefix))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -403,7 +407,7 @@ cleanup:
|
||||
|
||||
/* check for failed launch - if so, force terminate */
|
||||
if (failed_launch) {
|
||||
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
|
||||
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
|
||||
}
|
||||
|
||||
return rc;
|
||||
|
@ -244,6 +244,14 @@ void orte_plm_base_launch_failed(orte_jobid_t job, pid_t pid,
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job)));
|
||||
|
||||
/* if we didn't even attempt to launch, then just quietly update
|
||||
* the job record and leave
|
||||
*/
|
||||
if (ORTE_JOB_NEVER_LAUNCHED == state) {
|
||||
orte_never_launched = true;
|
||||
goto PROCESS;
|
||||
}
|
||||
|
||||
/* if this is the daemon job that failed, set the flag indicating
|
||||
* that a daemon failed so we use the proper
|
||||
* methods for attempting to shutdown the rest of the system
|
||||
@ -282,7 +290,7 @@ void orte_plm_base_launch_failed(orte_jobid_t job, pid_t pid,
|
||||
free(pidstr);
|
||||
}
|
||||
|
||||
|
||||
PROCESS:
|
||||
/* Set the job state as indicated so orterun's exit status
|
||||
will be non-zero
|
||||
*/
|
||||
|
@ -152,6 +152,7 @@ static int plm_ccp_launch_job(orte_job_t *jdata)
|
||||
JobPriority job_priority = JobPriority_Normal;
|
||||
|
||||
orte_jobid_t failed_job;
|
||||
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
|
||||
|
||||
/* default to declaring the daemon launch failed */
|
||||
failed_job = ORTE_PROC_MY_NAME->jobid;
|
||||
@ -364,7 +365,10 @@ GETMAP:
|
||||
|
||||
hr = pJob->SetExtendedJobTerm(_bstr_t(L"extended terms"), _bstr_t(L"TermValue"));
|
||||
|
||||
/* Iterate through each of the nodes and spin
|
||||
/* set the job state to indicate we attempted to launch */
|
||||
job_state = ORTE_JOB_STATE_FAILED_TO_START;
|
||||
|
||||
/* Iterate through each of the nodes and spin
|
||||
* up a daemon.
|
||||
*/
|
||||
for (i = 0; i < map->num_nodes; i++) {
|
||||
@ -566,7 +570,7 @@ launch_apps:
|
||||
|
||||
/* check for failed launch - if so, force terminate */
|
||||
if (failed_launch) {
|
||||
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
|
||||
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
|
||||
}
|
||||
|
||||
/* check for timing request - get stop time and process if so */
|
||||
|
@ -143,7 +143,8 @@ static int plm_lsf_launch_job(orte_job_t *jdata)
|
||||
orte_node_t **nodes;
|
||||
orte_std_cntr_t nnode;
|
||||
orte_jobid_t failed_job;
|
||||
|
||||
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
|
||||
|
||||
/* default to declaring the daemons failed*/
|
||||
failed_job = ORTE_PROC_MY_NAME->jobid;
|
||||
|
||||
@ -290,6 +291,9 @@ static int plm_lsf_launch_job(orte_job_t *jdata)
|
||||
}
|
||||
}
|
||||
|
||||
/* set the job state to indicate we attempted to launch */
|
||||
job_state = ORTE_JOB_STATE_FAILED_TO_START;
|
||||
|
||||
/* lsb_launch tampers with SIGCHLD.
|
||||
* After the call to lsb_launch, the signal handler for SIGCHLD is NULL.
|
||||
* So, we disable the SIGCHLD handler of libevent for the duration of
|
||||
@ -364,7 +368,7 @@ cleanup:
|
||||
|
||||
/* check for failed launch - if so, force terminate */
|
||||
if (failed_launch) {
|
||||
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
|
||||
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
|
||||
}
|
||||
|
||||
return rc;
|
||||
|
@ -85,8 +85,14 @@ typedef uint16_t orte_job_state_t;
|
||||
#define ORTE_JOB_STATE_ABORTED_BY_SIG 0x0400 /* job was killed by a signal */
|
||||
#define ORTE_JOB_STATE_ABORTED_WO_SYNC 0x0800 /* job was aborted because proc exit'd w/o required sync */
|
||||
|
||||
/* the job never even attempted to launch due to an error earlier in the
|
||||
* launch procedure
|
||||
*/
|
||||
#define ORTE_JOB_NEVER_LAUNCHED 0x1000
|
||||
|
||||
/* the processes in this job have been ordered to "die", but may not have completed it yet. Don't order it again */
|
||||
#define ORTE_JOB_STATE_ABORT_ORDERED 0x8000
|
||||
#define ORTE_JOB_STATE_ABORT_ORDERED 0x8000
|
||||
|
||||
|
||||
/**
|
||||
* Node State, corresponding to the ORTE_NODE_STATE_* #defines,
|
||||
|
@ -461,7 +461,8 @@ int orte_plm_process_launch(orte_job_t *jdata)
|
||||
orte_app_context_t **apps;
|
||||
orte_node_t **nodes;
|
||||
orte_std_cntr_t nnode;
|
||||
|
||||
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
|
||||
|
||||
if (mca_plm_process_component.timing) {
|
||||
if (0 != gettimeofday(&joblaunchstart, NULL)) {
|
||||
opal_output(0, "plm_process: could not obtain start time");
|
||||
@ -607,6 +608,9 @@ int orte_plm_process_launch(orte_job_t *jdata)
|
||||
lib_base = opal_basename(opal_install_dirs.libdir);
|
||||
bin_base = opal_basename(opal_install_dirs.bindir);
|
||||
|
||||
/* set the job state to indicate we attempted to launch */
|
||||
job_state = ORTE_JOB_STATE_FAILED_TO_START;
|
||||
|
||||
/*
|
||||
* Iterate through each of the nodes
|
||||
*/
|
||||
@ -860,7 +864,7 @@ launch_apps:
|
||||
|
||||
/* check for failed launch - if so, force terminate */
|
||||
if( failed_launch ) {
|
||||
orte_plm_base_launch_failed(jdata->jobid, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
|
||||
orte_plm_base_launch_failed(jdata->jobid, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
|
||||
}
|
||||
|
||||
return rc;
|
||||
|
@ -946,6 +946,7 @@ int orte_plm_rsh_launch(orte_job_t *jdata)
|
||||
orte_node_t **nodes;
|
||||
orte_std_cntr_t nnode;
|
||||
orte_jobid_t failed_job;
|
||||
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
|
||||
|
||||
/* default to declaring the daemon launch as having failed */
|
||||
failed_job = ORTE_PROC_MY_NAME->jobid;
|
||||
@ -1088,6 +1089,9 @@ int orte_plm_rsh_launch(orte_job_t *jdata)
|
||||
find_children(0, 0, 0, jdatorted->num_procs);
|
||||
}
|
||||
|
||||
/* set the job state to indicate we attempted to launch */
|
||||
job_state = ORTE_JOB_STATE_FAILED_TO_START;
|
||||
|
||||
/*
|
||||
* Iterate through each of the nodes
|
||||
*/
|
||||
@ -1237,7 +1241,7 @@ launch_apps:
|
||||
|
||||
/* check for failed launch - if so, force terminate */
|
||||
if (failed_launch) {
|
||||
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
|
||||
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
|
||||
}
|
||||
|
||||
/* setup a "heartbeat" timer to periodically check on
|
||||
|
@ -147,7 +147,8 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
||||
struct timeval launchstart, launchstop;
|
||||
int proc_vpid_index;
|
||||
orte_jobid_t failed_job;
|
||||
|
||||
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
|
||||
|
||||
/* flag the daemons as failing by default */
|
||||
failed_job = ORTE_PROC_MY_NAME->jobid;
|
||||
|
||||
@ -334,6 +335,9 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
||||
}
|
||||
}
|
||||
|
||||
/* set the job state to indicate we attempted to launch */
|
||||
job_state = ORTE_JOB_STATE_FAILED_TO_START;
|
||||
|
||||
/* setup environment */
|
||||
env = opal_argv_copy(orte_launch_environ);
|
||||
|
||||
@ -406,7 +410,7 @@ cleanup:
|
||||
|
||||
/* check for failed launch - if so, force terminate */
|
||||
if (failed_launch) {
|
||||
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
|
||||
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
|
||||
}
|
||||
|
||||
return rc;
|
||||
|
@ -148,7 +148,8 @@ static int plm_tm_launch_job(orte_job_t *jdata)
|
||||
bool failed_launch = true;
|
||||
mode_t current_umask;
|
||||
orte_jobid_t failed_job;
|
||||
|
||||
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
|
||||
|
||||
/* default to declaring the daemons as failed */
|
||||
failed_job = ORTE_PROC_MY_NAME->jobid;
|
||||
|
||||
@ -273,6 +274,9 @@ static int plm_tm_launch_job(orte_job_t *jdata)
|
||||
}
|
||||
}
|
||||
|
||||
/* set the job state to indicate we attempted to launch */
|
||||
job_state = ORTE_JOB_STATE_FAILED_TO_START;
|
||||
|
||||
/* Iterate through each of the nodes and spin
|
||||
* up a daemon.
|
||||
*/
|
||||
@ -413,7 +417,7 @@ launch_apps:
|
||||
|
||||
/* check for failed launch - if so, force terminate */
|
||||
if (failed_launch) {
|
||||
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
|
||||
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
|
||||
}
|
||||
|
||||
/* setup a "heartbeat" timer to periodically check on
|
||||
|
@ -58,6 +58,7 @@ int orted_debug_failure;
|
||||
int orted_debug_failure_delay;
|
||||
bool orte_homogeneous_nodes = false;
|
||||
bool orte_hetero_apps = false;
|
||||
bool orte_never_launched = false;
|
||||
|
||||
int32_t orte_contiguous_nodes;
|
||||
int orte_debug_output = -1;
|
||||
|
@ -380,6 +380,7 @@ ORTE_DECLSPEC extern int orted_debug_failure;
|
||||
ORTE_DECLSPEC extern int orted_debug_failure_delay;
|
||||
ORTE_DECLSPEC extern bool orte_homogeneous_nodes;
|
||||
ORTE_DECLSPEC extern bool orte_hetero_apps;
|
||||
ORTE_DECLSPEC extern bool orte_never_launched;
|
||||
|
||||
ORTE_DECLSPEC extern char **orte_launch_environ;
|
||||
ORTE_DECLSPEC extern opal_pointer_array_t orte_daemonmap;
|
||||
|
@ -643,6 +643,14 @@ static void job_completed(int trigpipe, short event, void *arg)
|
||||
|
||||
exit_state = jdata->state;
|
||||
|
||||
/* if we never launched, just skip this part to avoid
|
||||
* meaningless error messages
|
||||
*/
|
||||
if (orte_never_launched) {
|
||||
rc = orte_exit_status;
|
||||
goto DONE;
|
||||
}
|
||||
|
||||
if (ORTE_JOB_STATE_TERMINATED != exit_state) {
|
||||
/* abnormal termination of some kind */
|
||||
dump_aborted_procs();
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user