1
1

When we hit an error before actually launching any daemons, orterun should not complain about daemons failing to launch, mpirun detecting that a job failed, etc.

Add a new job state (ORTE_JOB_NEVER_LAUNCHED) to indicate that we never attempted to launch. When a launch fails in that state, set the orte_never_launched flag so that the other error messages are skipped.

This commit was SVN r19366.
Этот коммит содержится в:
Ralph Castain 2008-08-19 15:19:30 +00:00
родитель 9447334749
Коммит 4e0f34a062
12 изменённых файлов: 67 добавлений и 15 удалений

Просмотреть файл

@ -149,7 +149,8 @@ static int plm_alps_launch_job(orte_job_t *jdata)
orte_node_t **nodes;
orte_std_cntr_t nnode;
orte_jobid_t failed_job;
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
/* default to declaring the daemon launch failed */
failed_job = ORTE_PROC_MY_NAME->jobid;
@ -341,6 +342,9 @@ static int plm_alps_launch_job(orte_job_t *jdata)
}
}
/* set the job state to indicate we attempted to launch */
job_state = ORTE_JOB_STATE_FAILED_TO_START;
/* exec the daemon(s) */
if (ORTE_SUCCESS != (rc = plm_alps_start_proc(argc, argv, env, cur_prefix))) {
ORTE_ERROR_LOG(rc);
@ -403,7 +407,7 @@ cleanup:
/* check for failed launch - if so, force terminate */
if (failed_launch) {
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
}
return rc;

Просмотреть файл

@ -244,6 +244,14 @@ void orte_plm_base_launch_failed(orte_jobid_t job, pid_t pid,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(job)));
/* if we didn't even attempt to launch, then just quietly update
* the job record and leave
*/
if (ORTE_JOB_NEVER_LAUNCHED == state) {
orte_never_launched = true;
goto PROCESS;
}
/* if this is the daemon job that failed, set the flag indicating
* that a daemon failed so we use the proper
* methods for attempting to shutdown the rest of the system
@ -282,7 +290,7 @@ void orte_plm_base_launch_failed(orte_jobid_t job, pid_t pid,
free(pidstr);
}
PROCESS:
/* Set the job state as indicated so orterun's exit status
will be non-zero
*/

Просмотреть файл

@ -152,6 +152,7 @@ static int plm_ccp_launch_job(orte_job_t *jdata)
JobPriority job_priority = JobPriority_Normal;
orte_jobid_t failed_job;
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
/* default to declaring the daemon launch failed */
failed_job = ORTE_PROC_MY_NAME->jobid;
@ -364,7 +365,10 @@ GETMAP:
hr = pJob->SetExtendedJobTerm(_bstr_t(L"extended terms"), _bstr_t(L"TermValue"));
/* Iterate through each of the nodes and spin
/* set the job state to indicate we attempted to launch */
job_state = ORTE_JOB_STATE_FAILED_TO_START;
/* Iterate through each of the nodes and spin
* up a daemon.
*/
for (i = 0; i < map->num_nodes; i++) {
@ -566,7 +570,7 @@ launch_apps:
/* check for failed launch - if so, force terminate */
if (failed_launch) {
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
}
/* check for timing request - get stop time and process if so */

Просмотреть файл

@ -143,7 +143,8 @@ static int plm_lsf_launch_job(orte_job_t *jdata)
orte_node_t **nodes;
orte_std_cntr_t nnode;
orte_jobid_t failed_job;
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
/* default to declaring the daemons failed*/
failed_job = ORTE_PROC_MY_NAME->jobid;
@ -290,6 +291,9 @@ static int plm_lsf_launch_job(orte_job_t *jdata)
}
}
/* set the job state to indicate we attempted to launch */
job_state = ORTE_JOB_STATE_FAILED_TO_START;
/* lsb_launch tampers with SIGCHLD.
* After the call to lsb_launch, the signal handler for SIGCHLD is NULL.
* So, we disable the SIGCHLD handler of libevent for the duration of
@ -364,7 +368,7 @@ cleanup:
/* check for failed launch - if so, force terminate */
if (failed_launch) {
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
}
return rc;

Просмотреть файл

@ -85,8 +85,14 @@ typedef uint16_t orte_job_state_t;
#define ORTE_JOB_STATE_ABORTED_BY_SIG 0x0400 /* job was killed by a signal */
#define ORTE_JOB_STATE_ABORTED_WO_SYNC 0x0800 /* job was aborted because proc exit'd w/o required sync */
/* the job never even attempted to launch due to an error earlier in the
* launch procedure
*/
#define ORTE_JOB_NEVER_LAUNCHED 0x1000
/* the processes in this job have been ordered to "die", but may not have completed it yet. Don't order it again */
#define ORTE_JOB_STATE_ABORT_ORDERED 0x8000
#define ORTE_JOB_STATE_ABORT_ORDERED 0x8000
/**
* Node State, corresponding to the ORTE_NODE_STATE_* #defines,

Просмотреть файл

@ -461,7 +461,8 @@ int orte_plm_process_launch(orte_job_t *jdata)
orte_app_context_t **apps;
orte_node_t **nodes;
orte_std_cntr_t nnode;
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
if (mca_plm_process_component.timing) {
if (0 != gettimeofday(&joblaunchstart, NULL)) {
opal_output(0, "plm_process: could not obtain start time");
@ -607,6 +608,9 @@ int orte_plm_process_launch(orte_job_t *jdata)
lib_base = opal_basename(opal_install_dirs.libdir);
bin_base = opal_basename(opal_install_dirs.bindir);
/* set the job state to indicate we attempted to launch */
job_state = ORTE_JOB_STATE_FAILED_TO_START;
/*
* Iterate through each of the nodes
*/
@ -860,7 +864,7 @@ launch_apps:
/* check for failed launch - if so, force terminate */
if( failed_launch ) {
orte_plm_base_launch_failed(jdata->jobid, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
orte_plm_base_launch_failed(jdata->jobid, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
}
return rc;

Просмотреть файл

@ -946,6 +946,7 @@ int orte_plm_rsh_launch(orte_job_t *jdata)
orte_node_t **nodes;
orte_std_cntr_t nnode;
orte_jobid_t failed_job;
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
/* default to declaring the daemon launch as having failed */
failed_job = ORTE_PROC_MY_NAME->jobid;
@ -1088,6 +1089,9 @@ int orte_plm_rsh_launch(orte_job_t *jdata)
find_children(0, 0, 0, jdatorted->num_procs);
}
/* set the job state to indicate we attempted to launch */
job_state = ORTE_JOB_STATE_FAILED_TO_START;
/*
* Iterate through each of the nodes
*/
@ -1237,7 +1241,7 @@ launch_apps:
/* check for failed launch - if so, force terminate */
if (failed_launch) {
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
}
/* setup a "heartbeat" timer to periodically check on

Просмотреть файл

@ -147,7 +147,8 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
struct timeval launchstart, launchstop;
int proc_vpid_index;
orte_jobid_t failed_job;
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
/* flag the daemons as failing by default */
failed_job = ORTE_PROC_MY_NAME->jobid;
@ -334,6 +335,9 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
}
}
/* set the job state to indicate we attempted to launch */
job_state = ORTE_JOB_STATE_FAILED_TO_START;
/* setup environment */
env = opal_argv_copy(orte_launch_environ);
@ -406,7 +410,7 @@ cleanup:
/* check for failed launch - if so, force terminate */
if (failed_launch) {
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
}
return rc;

Просмотреть файл

@ -148,7 +148,8 @@ static int plm_tm_launch_job(orte_job_t *jdata)
bool failed_launch = true;
mode_t current_umask;
orte_jobid_t failed_job;
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
/* default to declaring the daemons as failed */
failed_job = ORTE_PROC_MY_NAME->jobid;
@ -273,6 +274,9 @@ static int plm_tm_launch_job(orte_job_t *jdata)
}
}
/* set the job state to indicate we attempted to launch */
job_state = ORTE_JOB_STATE_FAILED_TO_START;
/* Iterate through each of the nodes and spin
* up a daemon.
*/
@ -413,7 +417,7 @@ launch_apps:
/* check for failed launch - if so, force terminate */
if (failed_launch) {
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
}
/* setup a "heartbeat" timer to periodically check on

Просмотреть файл

@ -58,6 +58,7 @@ int orted_debug_failure;
int orted_debug_failure_delay;
bool orte_homogeneous_nodes = false;
bool orte_hetero_apps = false;
bool orte_never_launched = false;
int32_t orte_contiguous_nodes;
int orte_debug_output = -1;

Просмотреть файл

@ -380,6 +380,7 @@ ORTE_DECLSPEC extern int orted_debug_failure;
ORTE_DECLSPEC extern int orted_debug_failure_delay;
ORTE_DECLSPEC extern bool orte_homogeneous_nodes;
ORTE_DECLSPEC extern bool orte_hetero_apps;
ORTE_DECLSPEC extern bool orte_never_launched;
ORTE_DECLSPEC extern char **orte_launch_environ;
ORTE_DECLSPEC extern opal_pointer_array_t orte_daemonmap;

Просмотреть файл

@ -643,6 +643,14 @@ static void job_completed(int trigpipe, short event, void *arg)
exit_state = jdata->state;
/* if we never launched, just skip this part to avoid
* meaningless error messages
*/
if (orte_never_launched) {
rc = orte_exit_status;
goto DONE;
}
if (ORTE_JOB_STATE_TERMINATED != exit_state) {
/* abnormal termination of some kind */
dump_aborted_procs();