When we hit an error prior to actually launching daemons, it would be nice if orterun didn't bark about daemons failing to launch, mpirun detecting a job failed, etc.
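Add a new job state, ORTE_JOB_NEVER_LAUNCHED, to indicate that we never attempted to launch. Each launcher starts in this state and switches to ORTE_JOB_STATE_FAILED_TO_START just before it actually tries to start the daemons. When a failure is reported while still in the never-launched state, set the orte_never_launched flag so that orterun skips the other error messages. This commit was SVN r19366.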
Этот коммит содержится в:
родитель
9447334749
Коммит
4e0f34a062
@ -149,6 +149,7 @@ static int plm_alps_launch_job(orte_job_t *jdata)
|
|||||||
orte_node_t **nodes;
|
orte_node_t **nodes;
|
||||||
orte_std_cntr_t nnode;
|
orte_std_cntr_t nnode;
|
||||||
orte_jobid_t failed_job;
|
orte_jobid_t failed_job;
|
||||||
|
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
|
||||||
|
|
||||||
/* default to declaring the daemon launch failed */
|
/* default to declaring the daemon launch failed */
|
||||||
failed_job = ORTE_PROC_MY_NAME->jobid;
|
failed_job = ORTE_PROC_MY_NAME->jobid;
|
||||||
@ -341,6 +342,9 @@ static int plm_alps_launch_job(orte_job_t *jdata)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* set the job state to indicate we attempted to launch */
|
||||||
|
job_state = ORTE_JOB_STATE_FAILED_TO_START;
|
||||||
|
|
||||||
/* exec the daemon(s) */
|
/* exec the daemon(s) */
|
||||||
if (ORTE_SUCCESS != (rc = plm_alps_start_proc(argc, argv, env, cur_prefix))) {
|
if (ORTE_SUCCESS != (rc = plm_alps_start_proc(argc, argv, env, cur_prefix))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
@ -403,7 +407,7 @@ cleanup:
|
|||||||
|
|
||||||
/* check for failed launch - if so, force terminate */
|
/* check for failed launch - if so, force terminate */
|
||||||
if (failed_launch) {
|
if (failed_launch) {
|
||||||
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
|
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
|
||||||
}
|
}
|
||||||
|
|
||||||
return rc;
|
return rc;
|
||||||
|
@ -244,6 +244,14 @@ void orte_plm_base_launch_failed(orte_jobid_t job, pid_t pid,
|
|||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_JOBID_PRINT(job)));
|
ORTE_JOBID_PRINT(job)));
|
||||||
|
|
||||||
|
/* if we didn't even attempt to launch, then just quietly update
|
||||||
|
* the job record and leave
|
||||||
|
*/
|
||||||
|
if (ORTE_JOB_NEVER_LAUNCHED == state) {
|
||||||
|
orte_never_launched = true;
|
||||||
|
goto PROCESS;
|
||||||
|
}
|
||||||
|
|
||||||
/* if this is the daemon job that failed, set the flag indicating
|
/* if this is the daemon job that failed, set the flag indicating
|
||||||
* that a daemon failed so we use the proper
|
* that a daemon failed so we use the proper
|
||||||
* methods for attempting to shutdown the rest of the system
|
* methods for attempting to shutdown the rest of the system
|
||||||
@ -282,7 +290,7 @@ void orte_plm_base_launch_failed(orte_jobid_t job, pid_t pid,
|
|||||||
free(pidstr);
|
free(pidstr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PROCESS:
|
||||||
/* Set the job state as indicated so orterun's exit status
|
/* Set the job state as indicated so orterun's exit status
|
||||||
will be non-zero
|
will be non-zero
|
||||||
*/
|
*/
|
||||||
|
@ -152,6 +152,7 @@ static int plm_ccp_launch_job(orte_job_t *jdata)
|
|||||||
JobPriority job_priority = JobPriority_Normal;
|
JobPriority job_priority = JobPriority_Normal;
|
||||||
|
|
||||||
orte_jobid_t failed_job;
|
orte_jobid_t failed_job;
|
||||||
|
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
|
||||||
|
|
||||||
/* default to declaring the daemon launch failed */
|
/* default to declaring the daemon launch failed */
|
||||||
failed_job = ORTE_PROC_MY_NAME->jobid;
|
failed_job = ORTE_PROC_MY_NAME->jobid;
|
||||||
@ -364,7 +365,10 @@ GETMAP:
|
|||||||
|
|
||||||
hr = pJob->SetExtendedJobTerm(_bstr_t(L"extended terms"), _bstr_t(L"TermValue"));
|
hr = pJob->SetExtendedJobTerm(_bstr_t(L"extended terms"), _bstr_t(L"TermValue"));
|
||||||
|
|
||||||
/* Iterate through each of the nodes and spin
|
/* set the job state to indicate we attempted to launch */
|
||||||
|
job_state = ORTE_JOB_STATE_FAILED_TO_START;
|
||||||
|
|
||||||
|
/* Iterate through each of the nodes and spin
|
||||||
* up a daemon.
|
* up a daemon.
|
||||||
*/
|
*/
|
||||||
for (i = 0; i < map->num_nodes; i++) {
|
for (i = 0; i < map->num_nodes; i++) {
|
||||||
@ -566,7 +570,7 @@ launch_apps:
|
|||||||
|
|
||||||
/* check for failed launch - if so, force terminate */
|
/* check for failed launch - if so, force terminate */
|
||||||
if (failed_launch) {
|
if (failed_launch) {
|
||||||
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
|
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* check for timing request - get stop time and process if so */
|
/* check for timing request - get stop time and process if so */
|
||||||
|
@ -143,6 +143,7 @@ static int plm_lsf_launch_job(orte_job_t *jdata)
|
|||||||
orte_node_t **nodes;
|
orte_node_t **nodes;
|
||||||
orte_std_cntr_t nnode;
|
orte_std_cntr_t nnode;
|
||||||
orte_jobid_t failed_job;
|
orte_jobid_t failed_job;
|
||||||
|
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
|
||||||
|
|
||||||
/* default to declaring the daemons failed*/
|
/* default to declaring the daemons failed*/
|
||||||
failed_job = ORTE_PROC_MY_NAME->jobid;
|
failed_job = ORTE_PROC_MY_NAME->jobid;
|
||||||
@ -290,6 +291,9 @@ static int plm_lsf_launch_job(orte_job_t *jdata)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* set the job state to indicate we attempted to launch */
|
||||||
|
job_state = ORTE_JOB_STATE_FAILED_TO_START;
|
||||||
|
|
||||||
/* lsb_launch tampers with SIGCHLD.
|
/* lsb_launch tampers with SIGCHLD.
|
||||||
* After the call to lsb_launch, the signal handler for SIGCHLD is NULL.
|
* After the call to lsb_launch, the signal handler for SIGCHLD is NULL.
|
||||||
* So, we disable the SIGCHLD handler of libevent for the duration of
|
* So, we disable the SIGCHLD handler of libevent for the duration of
|
||||||
@ -364,7 +368,7 @@ cleanup:
|
|||||||
|
|
||||||
/* check for failed launch - if so, force terminate */
|
/* check for failed launch - if so, force terminate */
|
||||||
if (failed_launch) {
|
if (failed_launch) {
|
||||||
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
|
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
|
||||||
}
|
}
|
||||||
|
|
||||||
return rc;
|
return rc;
|
||||||
|
@ -85,9 +85,15 @@ typedef uint16_t orte_job_state_t;
|
|||||||
#define ORTE_JOB_STATE_ABORTED_BY_SIG 0x0400 /* job was killed by a signal */
|
#define ORTE_JOB_STATE_ABORTED_BY_SIG 0x0400 /* job was killed by a signal */
|
||||||
#define ORTE_JOB_STATE_ABORTED_WO_SYNC 0x0800 /* job was aborted because proc exit'd w/o required sync */
|
#define ORTE_JOB_STATE_ABORTED_WO_SYNC 0x0800 /* job was aborted because proc exit'd w/o required sync */
|
||||||
|
|
||||||
|
/* the job never even attempted to launch due to an error earlier in the
|
||||||
|
* launch procedure
|
||||||
|
*/
|
||||||
|
#define ORTE_JOB_NEVER_LAUNCHED 0x1000
|
||||||
|
|
||||||
/* the processes in this job have been ordered to "die", but may not have completed it yet. Don't order it again */
|
/* the processes in this job have been ordered to "die", but may not have completed it yet. Don't order it again */
|
||||||
#define ORTE_JOB_STATE_ABORT_ORDERED 0x8000
|
#define ORTE_JOB_STATE_ABORT_ORDERED 0x8000
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Node State, corresponding to the ORTE_NODE_STATE_* #defines,
|
* Node State, corresponding to the ORTE_NODE_STATE_* #defines,
|
||||||
* below. These are #defines instead of an enum because the thought
|
* below. These are #defines instead of an enum because the thought
|
||||||
|
@ -461,6 +461,7 @@ int orte_plm_process_launch(orte_job_t *jdata)
|
|||||||
orte_app_context_t **apps;
|
orte_app_context_t **apps;
|
||||||
orte_node_t **nodes;
|
orte_node_t **nodes;
|
||||||
orte_std_cntr_t nnode;
|
orte_std_cntr_t nnode;
|
||||||
|
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
|
||||||
|
|
||||||
if (mca_plm_process_component.timing) {
|
if (mca_plm_process_component.timing) {
|
||||||
if (0 != gettimeofday(&joblaunchstart, NULL)) {
|
if (0 != gettimeofday(&joblaunchstart, NULL)) {
|
||||||
@ -607,6 +608,9 @@ int orte_plm_process_launch(orte_job_t *jdata)
|
|||||||
lib_base = opal_basename(opal_install_dirs.libdir);
|
lib_base = opal_basename(opal_install_dirs.libdir);
|
||||||
bin_base = opal_basename(opal_install_dirs.bindir);
|
bin_base = opal_basename(opal_install_dirs.bindir);
|
||||||
|
|
||||||
|
/* set the job state to indicate we attempted to launch */
|
||||||
|
job_state = ORTE_JOB_STATE_FAILED_TO_START;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Iterate through each of the nodes
|
* Iterate through each of the nodes
|
||||||
*/
|
*/
|
||||||
@ -860,7 +864,7 @@ launch_apps:
|
|||||||
|
|
||||||
/* check for failed launch - if so, force terminate */
|
/* check for failed launch - if so, force terminate */
|
||||||
if( failed_launch ) {
|
if( failed_launch ) {
|
||||||
orte_plm_base_launch_failed(jdata->jobid, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
|
orte_plm_base_launch_failed(jdata->jobid, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
|
||||||
}
|
}
|
||||||
|
|
||||||
return rc;
|
return rc;
|
||||||
|
@ -946,6 +946,7 @@ int orte_plm_rsh_launch(orte_job_t *jdata)
|
|||||||
orte_node_t **nodes;
|
orte_node_t **nodes;
|
||||||
orte_std_cntr_t nnode;
|
orte_std_cntr_t nnode;
|
||||||
orte_jobid_t failed_job;
|
orte_jobid_t failed_job;
|
||||||
|
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
|
||||||
|
|
||||||
/* default to declaring the daemon launch as having failed */
|
/* default to declaring the daemon launch as having failed */
|
||||||
failed_job = ORTE_PROC_MY_NAME->jobid;
|
failed_job = ORTE_PROC_MY_NAME->jobid;
|
||||||
@ -1088,6 +1089,9 @@ int orte_plm_rsh_launch(orte_job_t *jdata)
|
|||||||
find_children(0, 0, 0, jdatorted->num_procs);
|
find_children(0, 0, 0, jdatorted->num_procs);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* set the job state to indicate we attempted to launch */
|
||||||
|
job_state = ORTE_JOB_STATE_FAILED_TO_START;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Iterate through each of the nodes
|
* Iterate through each of the nodes
|
||||||
*/
|
*/
|
||||||
@ -1237,7 +1241,7 @@ launch_apps:
|
|||||||
|
|
||||||
/* check for failed launch - if so, force terminate */
|
/* check for failed launch - if so, force terminate */
|
||||||
if (failed_launch) {
|
if (failed_launch) {
|
||||||
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
|
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* setup a "heartbeat" timer to periodically check on
|
/* setup a "heartbeat" timer to periodically check on
|
||||||
|
@ -147,6 +147,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
|||||||
struct timeval launchstart, launchstop;
|
struct timeval launchstart, launchstop;
|
||||||
int proc_vpid_index;
|
int proc_vpid_index;
|
||||||
orte_jobid_t failed_job;
|
orte_jobid_t failed_job;
|
||||||
|
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
|
||||||
|
|
||||||
/* flag the daemons as failing by default */
|
/* flag the daemons as failing by default */
|
||||||
failed_job = ORTE_PROC_MY_NAME->jobid;
|
failed_job = ORTE_PROC_MY_NAME->jobid;
|
||||||
@ -334,6 +335,9 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* set the job state to indicate we attempted to launch */
|
||||||
|
job_state = ORTE_JOB_STATE_FAILED_TO_START;
|
||||||
|
|
||||||
/* setup environment */
|
/* setup environment */
|
||||||
env = opal_argv_copy(orte_launch_environ);
|
env = opal_argv_copy(orte_launch_environ);
|
||||||
|
|
||||||
@ -406,7 +410,7 @@ cleanup:
|
|||||||
|
|
||||||
/* check for failed launch - if so, force terminate */
|
/* check for failed launch - if so, force terminate */
|
||||||
if (failed_launch) {
|
if (failed_launch) {
|
||||||
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
|
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
|
||||||
}
|
}
|
||||||
|
|
||||||
return rc;
|
return rc;
|
||||||
|
@ -148,6 +148,7 @@ static int plm_tm_launch_job(orte_job_t *jdata)
|
|||||||
bool failed_launch = true;
|
bool failed_launch = true;
|
||||||
mode_t current_umask;
|
mode_t current_umask;
|
||||||
orte_jobid_t failed_job;
|
orte_jobid_t failed_job;
|
||||||
|
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
|
||||||
|
|
||||||
/* default to declaring the daemons as failed */
|
/* default to declaring the daemons as failed */
|
||||||
failed_job = ORTE_PROC_MY_NAME->jobid;
|
failed_job = ORTE_PROC_MY_NAME->jobid;
|
||||||
@ -273,6 +274,9 @@ static int plm_tm_launch_job(orte_job_t *jdata)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* set the job state to indicate we attempted to launch */
|
||||||
|
job_state = ORTE_JOB_STATE_FAILED_TO_START;
|
||||||
|
|
||||||
/* Iterate through each of the nodes and spin
|
/* Iterate through each of the nodes and spin
|
||||||
* up a daemon.
|
* up a daemon.
|
||||||
*/
|
*/
|
||||||
@ -413,7 +417,7 @@ launch_apps:
|
|||||||
|
|
||||||
/* check for failed launch - if so, force terminate */
|
/* check for failed launch - if so, force terminate */
|
||||||
if (failed_launch) {
|
if (failed_launch) {
|
||||||
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
|
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* setup a "heartbeat" timer to periodically check on
|
/* setup a "heartbeat" timer to periodically check on
|
||||||
|
@ -58,6 +58,7 @@ int orted_debug_failure;
|
|||||||
int orted_debug_failure_delay;
|
int orted_debug_failure_delay;
|
||||||
bool orte_homogeneous_nodes = false;
|
bool orte_homogeneous_nodes = false;
|
||||||
bool orte_hetero_apps = false;
|
bool orte_hetero_apps = false;
|
||||||
|
bool orte_never_launched = false;
|
||||||
|
|
||||||
int32_t orte_contiguous_nodes;
|
int32_t orte_contiguous_nodes;
|
||||||
int orte_debug_output = -1;
|
int orte_debug_output = -1;
|
||||||
|
@ -380,6 +380,7 @@ ORTE_DECLSPEC extern int orted_debug_failure;
|
|||||||
ORTE_DECLSPEC extern int orted_debug_failure_delay;
|
ORTE_DECLSPEC extern int orted_debug_failure_delay;
|
||||||
ORTE_DECLSPEC extern bool orte_homogeneous_nodes;
|
ORTE_DECLSPEC extern bool orte_homogeneous_nodes;
|
||||||
ORTE_DECLSPEC extern bool orte_hetero_apps;
|
ORTE_DECLSPEC extern bool orte_hetero_apps;
|
||||||
|
ORTE_DECLSPEC extern bool orte_never_launched;
|
||||||
|
|
||||||
ORTE_DECLSPEC extern char **orte_launch_environ;
|
ORTE_DECLSPEC extern char **orte_launch_environ;
|
||||||
ORTE_DECLSPEC extern opal_pointer_array_t orte_daemonmap;
|
ORTE_DECLSPEC extern opal_pointer_array_t orte_daemonmap;
|
||||||
|
@ -643,6 +643,14 @@ static void job_completed(int trigpipe, short event, void *arg)
|
|||||||
|
|
||||||
exit_state = jdata->state;
|
exit_state = jdata->state;
|
||||||
|
|
||||||
|
/* if we never launched, just skip this part to avoid
|
||||||
|
* meaningless error messages
|
||||||
|
*/
|
||||||
|
if (orte_never_launched) {
|
||||||
|
rc = orte_exit_status;
|
||||||
|
goto DONE;
|
||||||
|
}
|
||||||
|
|
||||||
if (ORTE_JOB_STATE_TERMINATED != exit_state) {
|
if (ORTE_JOB_STATE_TERMINATED != exit_state) {
|
||||||
/* abnormal termination of some kind */
|
/* abnormal termination of some kind */
|
||||||
dump_aborted_procs();
|
dump_aborted_procs();
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user