Add some debug output to job_complete. If no session dirs were created, then we cannot check for the abort file - which wouldn't have been created anyway
This commit was SVN r22903.
Этот коммит содержится в:
родитель
eaed49594c
Коммит
f6bfaa76ba
@ -2774,91 +2774,93 @@ GOTCHILD:
|
|||||||
* of an "abort" file in this process' session directory. If
|
* of an "abort" file in this process' session directory. If
|
||||||
* we find it, then we know that this was an abnormal termination.
|
* we find it, then we know that this was an abnormal termination.
|
||||||
*/
|
*/
|
||||||
if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&job, child->name->jobid))) {
|
if (orte_create_session_dirs) {
|
||||||
ORTE_ERROR_LOG(rc);
|
if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&job, child->name->jobid))) {
|
||||||
goto MOVEON;
|
ORTE_ERROR_LOG(rc);
|
||||||
}
|
goto MOVEON;
|
||||||
if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&vpid, child->name->vpid))) {
|
}
|
||||||
ORTE_ERROR_LOG(rc);
|
if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&vpid, child->name->vpid))) {
|
||||||
free(job);
|
ORTE_ERROR_LOG(rc);
|
||||||
goto MOVEON;
|
free(job);
|
||||||
}
|
goto MOVEON;
|
||||||
abort_file = opal_os_path(false, orte_process_info.tmpdir_base,
|
}
|
||||||
orte_process_info.top_session_dir,
|
abort_file = opal_os_path(false, orte_process_info.tmpdir_base,
|
||||||
job, vpid, "abort", NULL );
|
orte_process_info.top_session_dir,
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
job, vpid, "abort", NULL );
|
||||||
"%s odls:waitpid_fired checking abort file %s",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), abort_file));
|
|
||||||
|
|
||||||
free(job);
|
|
||||||
free(vpid);
|
|
||||||
if (0 == stat(abort_file, &buf)) {
|
|
||||||
/* the abort file must exist - there is nothing in it we need. Its
|
|
||||||
* mere existence indicates that an abnormal termination occurred
|
|
||||||
*/
|
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||||
"%s odls:waitpid_fired child %s died by abort",
|
"%s odls:waitpid_fired checking abort file %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), abort_file));
|
||||||
ORTE_NAME_PRINT(child->name)));
|
|
||||||
|
|
||||||
child->state = ORTE_PROC_STATE_ABORTED;
|
free(job);
|
||||||
|
free(vpid);
|
||||||
|
if (0 == stat(abort_file, &buf)) {
|
||||||
|
/* the abort file must exist - there is nothing in it we need. Its
|
||||||
|
* mere existence indicates that an abnormal termination occurred
|
||||||
|
*/
|
||||||
|
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||||
|
"%s odls:waitpid_fired child %s died by abort",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(child->name)));
|
||||||
|
|
||||||
|
child->state = ORTE_PROC_STATE_ABORTED;
|
||||||
|
free(abort_file);
|
||||||
|
goto MOVEON;
|
||||||
|
}
|
||||||
free(abort_file);
|
free(abort_file);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* check to see if a sync was required and if it was received */
|
||||||
|
if (child->init_recvd) {
|
||||||
|
if (!child->fini_recvd) {
|
||||||
|
/* we required a finalizing sync and didn't get it, so this
|
||||||
|
* is considered an abnormal termination and treated accordingly
|
||||||
|
*/
|
||||||
|
child->state = ORTE_PROC_STATE_TERM_WO_SYNC;
|
||||||
|
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||||
|
"%s odls:waitpid_fired child process %s terminated normally "
|
||||||
|
"but did not provide a required finalize sync - it "
|
||||||
|
"will be treated as an abnormal termination",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(child->name)));
|
||||||
|
|
||||||
|
goto MOVEON;
|
||||||
|
}
|
||||||
|
/* if we did recv a finalize sync, then it terminated normally */
|
||||||
|
child->state = ORTE_PROC_STATE_TERMINATED;
|
||||||
} else {
|
} else {
|
||||||
/* okay, it terminated normally - check to see if a sync was required and
|
/* has any child in this job already registered? */
|
||||||
* if it was received
|
for (item = opal_list_get_first(&orte_local_children);
|
||||||
*/
|
item != opal_list_get_end(&orte_local_children);
|
||||||
if (child->init_recvd) {
|
item = opal_list_get_next(item)) {
|
||||||
if (!child->fini_recvd) {
|
chd = (orte_odls_child_t*)item;
|
||||||
/* we required a finalizing sync and didn't get it, so this
|
|
||||||
* is considered an abnormal termination and treated accordingly
|
if (chd->init_recvd) {
|
||||||
|
/* someone has registered, and we didn't before
|
||||||
|
* terminating - this is an abnormal termination
|
||||||
*/
|
*/
|
||||||
child->state = ORTE_PROC_STATE_TERM_WO_SYNC;
|
child->state = ORTE_PROC_STATE_TERM_WO_SYNC;
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||||
"%s odls:waitpid_fired child process %s terminated normally "
|
"%s odls:waitpid_fired child process %s terminated normally "
|
||||||
"but did not provide a required finalize sync - it "
|
"but did not provide a required init sync - it "
|
||||||
"will be treated as an abnormal termination",
|
"will be treated as an abnormal termination",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(child->name)));
|
ORTE_NAME_PRINT(child->name)));
|
||||||
|
|
||||||
goto MOVEON;
|
goto MOVEON;
|
||||||
}
|
}
|
||||||
/* if we did recv a finalize sync, then it terminated normally */
|
|
||||||
child->state = ORTE_PROC_STATE_TERMINATED;
|
|
||||||
} else {
|
|
||||||
/* has any child in this job already registered? */
|
|
||||||
for (item = opal_list_get_first(&orte_local_children);
|
|
||||||
item != opal_list_get_end(&orte_local_children);
|
|
||||||
item = opal_list_get_next(item)) {
|
|
||||||
chd = (orte_odls_child_t*)item;
|
|
||||||
|
|
||||||
if (chd->init_recvd) {
|
|
||||||
/* someone has registered, and we didn't before
|
|
||||||
* terminating - this is an abnormal termination
|
|
||||||
*/
|
|
||||||
child->state = ORTE_PROC_STATE_TERM_WO_SYNC;
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
|
||||||
"%s odls:waitpid_fired child process %s terminated normally "
|
|
||||||
"but did not provide a required init sync - it "
|
|
||||||
"will be treated as an abnormal termination",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(child->name)));
|
|
||||||
|
|
||||||
goto MOVEON;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/* if no child has registered, then it is possible that
|
|
||||||
* none of them will. This is considered acceptable
|
|
||||||
*/
|
|
||||||
child->state = ORTE_PROC_STATE_TERMINATED;
|
|
||||||
}
|
}
|
||||||
|
/* if no child has registered, then it is possible that
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
* none of them will. This is considered acceptable
|
||||||
"%s odls:waitpid_fired child process %s terminated normally",
|
*/
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
child->state = ORTE_PROC_STATE_TERMINATED;
|
||||||
ORTE_NAME_PRINT(child->name)));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||||
|
"%s odls:waitpid_fired child process %s terminated normally",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(child->name)));
|
||||||
} else {
|
} else {
|
||||||
/* the process was terminated with a signal! That's definitely
|
/* the process was terminated with a signal! That's definitely
|
||||||
* abnormal, so indicate that condition
|
* abnormal, so indicate that condition
|
||||||
|
@ -1405,6 +1405,10 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
|
|||||||
* Determine how the process state affects the job state
|
* Determine how the process state affects the job state
|
||||||
*/
|
*/
|
||||||
if (ORTE_PROC_STATE_FAILED_TO_START == proc->state) {
|
if (ORTE_PROC_STATE_FAILED_TO_START == proc->state) {
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||||
|
"%s plm:base:check_job_completed proc %s failed to start",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(&proc->name)));
|
||||||
jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
|
jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
|
||||||
if (!jdata->abort) {
|
if (!jdata->abort) {
|
||||||
/* point to the lowest rank to cause the problem */
|
/* point to the lowest rank to cause the problem */
|
||||||
@ -1415,6 +1419,10 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
|
|||||||
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
|
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
|
||||||
}
|
}
|
||||||
} else if (ORTE_PROC_STATE_ABORTED == proc->state) {
|
} else if (ORTE_PROC_STATE_ABORTED == proc->state) {
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||||
|
"%s plm:base:check_job_completed proc %s aborted",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(&proc->name)));
|
||||||
jdata->state = ORTE_JOB_STATE_ABORTED;
|
jdata->state = ORTE_JOB_STATE_ABORTED;
|
||||||
if (!jdata->abort) {
|
if (!jdata->abort) {
|
||||||
/* point to the lowest rank to cause the problem */
|
/* point to the lowest rank to cause the problem */
|
||||||
@ -1425,6 +1433,10 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
|
|||||||
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
|
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
|
||||||
}
|
}
|
||||||
} else if (ORTE_PROC_STATE_ABORTED_BY_SIG == proc->state) {
|
} else if (ORTE_PROC_STATE_ABORTED_BY_SIG == proc->state) {
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||||
|
"%s plm:base:check_job_completed proc %s aborted by signal",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(&proc->name)));
|
||||||
jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG;
|
jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG;
|
||||||
if (!jdata->abort) {
|
if (!jdata->abort) {
|
||||||
/* point to the lowest rank to cause the problem */
|
/* point to the lowest rank to cause the problem */
|
||||||
@ -1435,6 +1447,10 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
|
|||||||
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
|
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
|
||||||
}
|
}
|
||||||
} else if (ORTE_PROC_STATE_TERM_WO_SYNC == proc->state) {
|
} else if (ORTE_PROC_STATE_TERM_WO_SYNC == proc->state) {
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||||
|
"%s plm:base:check_job_completed proc %s terminated without sync",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(&proc->name)));
|
||||||
jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC;
|
jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC;
|
||||||
if (!jdata->abort) {
|
if (!jdata->abort) {
|
||||||
/* point to the lowest rank to cause the problem */
|
/* point to the lowest rank to cause the problem */
|
||||||
@ -1451,6 +1467,10 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
|
|||||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||||
}
|
}
|
||||||
} else if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) {
|
} else if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) {
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||||
|
"%s plm:base:check_job_completed proc %s killed by cmd",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(&proc->name)));
|
||||||
/* we ordered this proc to die, so it isn't an abnormal termination
|
/* we ordered this proc to die, so it isn't an abnormal termination
|
||||||
* and we don't flag it as such - just check the remaining jobs to
|
* and we don't flag it as such - just check the remaining jobs to
|
||||||
* see if anyone is still alive
|
* see if anyone is still alive
|
||||||
@ -1464,6 +1484,10 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
|
|||||||
goto CHECK_ALL_JOBS;
|
goto CHECK_ALL_JOBS;
|
||||||
} else if (ORTE_PROC_STATE_UNTERMINATED < proc->state &&
|
} else if (ORTE_PROC_STATE_UNTERMINATED < proc->state &&
|
||||||
jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP) {
|
jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP) {
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||||
|
"%s plm:base:check_job_completed proc %s terminated and continuous",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(&proc->name)));
|
||||||
proc->state = ORTE_PROC_STATE_ABORTED;
|
proc->state = ORTE_PROC_STATE_ABORTED;
|
||||||
jdata->state = ORTE_JOB_STATE_ABORTED;
|
jdata->state = ORTE_JOB_STATE_ABORTED;
|
||||||
if (!jdata->abort) {
|
if (!jdata->abort) {
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user