From f6bfaa76baef39d35fa7d53551ff18177cb2d666 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 29 Mar 2010 23:21:03 +0000 Subject: [PATCH] Add some debug output to job_complete. If no session dirs were created, then cannot check for abort file - which wouldn't be created anyway This commit was SVN r22903. --- orte/mca/odls/base/odls_base_default_fns.c | 140 ++++++++++---------- orte/mca/plm/base/plm_base_launch_support.c | 24 ++++ 2 files changed, 95 insertions(+), 69 deletions(-) diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index 1b815408cb..9ff62cabbe 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -2774,91 +2774,93 @@ GOTCHILD: * of an "abort" file in this process' session directory. If * we find it, then we know that this was an abnormal termination. */ - if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&job, child->name->jobid))) { - ORTE_ERROR_LOG(rc); - goto MOVEON; - } - if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&vpid, child->name->vpid))) { - ORTE_ERROR_LOG(rc); - free(job); - goto MOVEON; - } - abort_file = opal_os_path(false, orte_process_info.tmpdir_base, - orte_process_info.top_session_dir, - job, vpid, "abort", NULL ); - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:waitpid_fired checking abort file %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), abort_file)); - - free(job); - free(vpid); - if (0 == stat(abort_file, &buf)) { - /* the abort file must exist - there is nothing in it we need. It's - * meer existence indicates that an abnormal termination occurred - */ - + if (orte_create_session_dirs) { + if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&job, child->name->jobid))) { + ORTE_ERROR_LOG(rc); + goto MOVEON; + } + if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&vpid, child->name->vpid))) { + ORTE_ERROR_LOG(rc); + free(job); + goto MOVEON; + } + abort_file = opal_os_path(false, orte_process_info.tmpdir_base, + orte_process_info.top_session_dir, + job, vpid, "abort", NULL ); OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:waitpid_fired child %s died by abort", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + "%s odls:waitpid_fired checking abort file %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), abort_file)); - child->state = ORTE_PROC_STATE_ABORTED; + free(job); + free(vpid); + if (0 == stat(abort_file, &buf)) { + /* the abort file must exist - there is nothing in it we need. It's + * meer existence indicates that an abnormal termination occurred + */ + + OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, + "%s odls:waitpid_fired child %s died by abort", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(child->name))); + + child->state = ORTE_PROC_STATE_ABORTED; + free(abort_file); + goto MOVEON; + } free(abort_file); + } + + /* check to see if a sync was required and if it was received */ + if (child->init_recvd) { + if (!child->fini_recvd) { + /* we required a finalizing sync and didn't get it, so this + * is considered an abnormal termination and treated accordingly + */ + child->state = ORTE_PROC_STATE_TERM_WO_SYNC; + + OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, + "%s odls:waitpid_fired child process %s terminated normally " + "but did not provide a required finalize sync - it " + "will be treated as an abnormal termination", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(child->name))); + + goto MOVEON; + } + /* if we did recv a finalize sync, then it terminated normally */ + child->state = ORTE_PROC_STATE_TERMINATED; } else { - /* okay, it terminated normally - check to see if a sync was required and - * if it was received - */ - if (child->init_recvd) { - if (!child->fini_recvd) { - /* we required a finalizing sync and didn't get it, so this - * is considered an abnormal termination and treated accordingly + /* has any child in this job already registered? */ + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = opal_list_get_next(item)) { + chd = (orte_odls_child_t*)item; + + if (chd->init_recvd) { + /* someone has registered, and we didn't before + * terminating - this is an abnormal termination */ child->state = ORTE_PROC_STATE_TERM_WO_SYNC; - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:waitpid_fired child process %s terminated normally " - "but did not provide a required finalize sync - it " + "but did not provide a required init sync - it " "will be treated as an abnormal termination", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(child->name))); goto MOVEON; } - /* if we did recv a finalize sync, then it terminated normally */ - child->state = ORTE_PROC_STATE_TERMINATED; - } else { - /* has any child in this job already registered? */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - chd = (orte_odls_child_t*)item; - - if (chd->init_recvd) { - /* someone has registered, and we didn't before - * terminating - this is an abnormal termination - */ - child->state = ORTE_PROC_STATE_TERM_WO_SYNC; - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:waitpid_fired child process %s terminated normally " - "but did not provide a required init sync - it " - "will be treated as an abnormal termination", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); - - goto MOVEON; - } - } - /* if no child has registered, then it is possible that - * none of them will. This is considered acceptable - */ - child->state = ORTE_PROC_STATE_TERMINATED; } - - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:waitpid_fired child process %s terminated normally", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + /* if no child has registered, then it is possible that + * none of them will. This is considered acceptable + */ + child->state = ORTE_PROC_STATE_TERMINATED; } + + OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, + "%s odls:waitpid_fired child process %s terminated normally", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(child->name))); } else { /* the process was terminated with a signal! That's definitely * abnormal, so indicate that condition diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index a494300812..7941789b01 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -1405,6 +1405,10 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata) * Determine how the process state affects the job state */ if (ORTE_PROC_STATE_FAILED_TO_START == proc->state) { + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, + "%s plm:base:check_job_completed proc %s failed to start", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); jdata->state = ORTE_JOB_STATE_FAILED_TO_START; if (!jdata->abort) { /* point to the lowest rank to cause the problem */ @@ -1415,6 +1419,10 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata) ORTE_UPDATE_EXIT_STATUS(proc->exit_code); } } else if (ORTE_PROC_STATE_ABORTED == proc->state) { + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, + "%s plm:base:check_job_completed proc %s aborted", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); jdata->state = ORTE_JOB_STATE_ABORTED; if (!jdata->abort) { /* point to the lowest rank to cause the problem */ @@ -1425,6 +1433,10 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata) ORTE_UPDATE_EXIT_STATUS(proc->exit_code); } } else if (ORTE_PROC_STATE_ABORTED_BY_SIG == proc->state) { + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, + "%s plm:base:check_job_completed proc %s aborted by signal", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG; if (!jdata->abort) { /* point to the lowest rank to cause the problem */ @@ -1435,6 +1447,10 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata) ORTE_UPDATE_EXIT_STATUS(proc->exit_code); } } else if (ORTE_PROC_STATE_TERM_WO_SYNC == proc->state) { + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, + "%s plm:base:check_job_completed proc %s terminated without sync", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC; if (!jdata->abort) { /* point to the lowest rank to cause the problem */ @@ -1451,6 +1467,10 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata) ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); } } else if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) { + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, + "%s plm:base:check_job_completed proc %s killed by cmd", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); /* we ordered this proc to die, so it isn't an abnormal termination * and we don't flag it as such - just check the remaining jobs to * see if anyone is still alive @@ -1464,6 +1484,10 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata) goto CHECK_ALL_JOBS; } else if (ORTE_PROC_STATE_UNTERMINATED < proc->state && jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP) { + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, + "%s plm:base:check_job_completed proc %s terminated and continuous", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); proc->state = ORTE_PROC_STATE_ABORTED; jdata->state = ORTE_JOB_STATE_ABORTED; if (!jdata->abort) {