From 2babebf9c32fe55dd386c0c9e95e2e77c29baa2b Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Sat, 10 Jul 2010 03:33:01 +0000 Subject: [PATCH] Revert some of a prior commit - there is no need for another flag to indicate one-sided termination of the orteds. This commit was SVN r23372. --- orte/mca/errmgr/hnp/errmgr_hnp.c | 306 +++++++++++++------------- orte/mca/plm/slurm/plm_slurm_module.c | 1 - orte/runtime/orte_globals.c | 1 - orte/runtime/orte_globals.h | 1 - 4 files changed, 154 insertions(+), 155 deletions(-) diff --git a/orte/mca/errmgr/hnp/errmgr_hnp.c b/orte/mca/errmgr/hnp/errmgr_hnp.c index fd7730b019..717729a205 100644 --- a/orte/mca/errmgr/hnp/errmgr_hnp.c +++ b/orte/mca/errmgr/hnp/errmgr_hnp.c @@ -399,14 +399,16 @@ static int update_state(orte_jobid_t job, /* if we have ordered orteds to terminate, see if this one failed to tell * us it had terminated */ - if (orte_orteds_openend_term_ordered) { - record_dead_daemon(jdata, proc->vpid, state, 0); - check_job_complete(jdata); - break; - } else if (orte_orteds_term_ordered) { - record_dead_daemon(jdata, proc->vpid, state, exit_code); - check_job_complete(jdata); - break; + if (orte_orteds_term_ordered) { + if (orte_orted_exit_with_barrier) { + record_dead_daemon(jdata, proc->vpid, state, exit_code); + check_job_complete(jdata); + break; + } else { + record_dead_daemon(jdata, proc->vpid, state, 0); + check_job_complete(jdata); + break; + } } /* if abort is in progress, see if this one failed to tell * us it had terminated @@ -770,31 +772,145 @@ static void check_job_complete(orte_job_t *jdata) } switch (proc->state) { - case ORTE_PROC_STATE_KILLED_BY_CMD: - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed proc %s killed by cmd", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - /* we ordered this proc to die, so it isn't an abnormal termination - * and we don't flag it as such - just check the remaining jobs to - * see if anyone is still alive + case ORTE_PROC_STATE_KILLED_BY_CMD: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_completed proc %s killed by cmd", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); + /* we ordered this proc to die, so it isn't an abnormal termination + * and we don't flag it as such - just check the remaining jobs to + * see if anyone is still alive + */ + if (jdata->num_terminated >= jdata->num_procs) { + /* this job has terminated - now we need to check to see if ALL + * the other jobs have also completed and wakeup if that is true */ - if (jdata->num_terminated >= jdata->num_procs) { - /* this job has terminated - now we need to check to see if ALL - * the other jobs have also completed and wakeup if that is true - */ - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_KILLED_BY_CMD; - } + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_KILLED_BY_CMD; } - goto CHECK_ALIVE; - break; - case ORTE_PROC_STATE_ABORTED: + } + goto CHECK_ALIVE; + break; + case ORTE_PROC_STATE_ABORTED: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_completed proc %s aborted", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_ABORTED; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } + break; + case ORTE_PROC_STATE_FAILED_TO_START: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr_hnp:check_job_completed proc %s failed to start", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_FAILED_TO_START; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } + break; + case ORTE_PROC_STATE_ABORTED_BY_SIG: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_completed proc %s aborted by signal", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } + break; + case ORTE_PROC_STATE_TERM_WO_SYNC: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_completed proc %s terminated without sync", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + /* now treat a special case - if the proc exit'd without a required + * sync, it may have done so with a zero exit code. We want to ensure + * that the user realizes there was an error, so in this -one- case, + * we overwrite the process' exit code with the default error code + */ + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + } + break; + case ORTE_PROC_STATE_COMM_FAILED: + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_COMM_FAILED; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } + break; + case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } + break; + case ORTE_PROC_STATE_CALLED_ABORT: + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_CALLED_ABORT; + /* point to the first proc to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } + break; + case ORTE_PROC_STATE_HEARTBEAT_FAILED: + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_HEARTBEAT_FAILED; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } + break; + + default: + if (ORTE_PROC_STATE_UNTERMINATED < proc->state && + jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP) { OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed proc %s aborted", + "%s errmgr:hnp:check_job_completed proc %s terminated and continuous", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name))); if (!jdata->abort) { + proc->state = ORTE_PROC_STATE_ABORTED; jdata->state = ORTE_JOB_STATE_ABORTED; /* point to the lowest rank to cause the problem */ jdata->aborted_proc = proc; @@ -803,122 +919,8 @@ static void check_job_complete(orte_job_t *jdata) jdata->abort = true; ORTE_UPDATE_EXIT_STATUS(proc->exit_code); } - break; - case ORTE_PROC_STATE_FAILED_TO_START: - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr_hnp:check_job_completed proc %s failed to start", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_FAILED_TO_START; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_ABORTED_BY_SIG: - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed proc %s aborted by signal", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_TERM_WO_SYNC: - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed proc %s terminated without sync", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - /* now treat a special case - if the proc exit'd without a required - * sync, it may have done so with a zero exit code. We want to ensure - * that the user realizes there was an error, so in this -one- case, - * we overwrite the process' exit code with the default error code - */ - ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - } - break; - case ORTE_PROC_STATE_COMM_FAILED: - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_COMM_FAILED; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_CALLED_ABORT: - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_CALLED_ABORT; - /* point to the first proc to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_HEARTBEAT_FAILED: - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_HEARTBEAT_FAILED; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - - default: - if (ORTE_PROC_STATE_UNTERMINATED < proc->state && - jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed proc %s terminated and continuous", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - if (!jdata->abort) { - proc->state = ORTE_PROC_STATE_ABORTED; - jdata->state = ORTE_JOB_STATE_ABORTED; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - } - break; + } + break; } } @@ -1028,7 +1030,7 @@ static void check_job_complete(orte_job_t *jdata) jdata->map = NULL; } -CHECK_ALIVE: + CHECK_ALIVE: /* now check to see if all jobs are done - release this jdata * object when we find it */ @@ -1049,14 +1051,14 @@ CHECK_ALIVE: if (NULL != jdata && job->jobid == jdata->jobid && (jdata->state == ORTE_JOB_STATE_TERMINATED || jdata->state == ORTE_JOB_STATE_KILLED_BY_CMD)) { - /* release this object, ensuring that the - * pointer array internal accounting - * is maintained! - */ - opal_pointer_array_set_item(orte_job_data, j, NULL); /* ensure the array has a NULL */ - OBJ_RELEASE(jdata); - continue; - } + /* release this object, ensuring that the + * pointer array internal accounting + * is maintained! + */ + opal_pointer_array_set_item(orte_job_data, j, NULL); /* ensure the array has a NULL */ + OBJ_RELEASE(jdata); + continue; + } /* if the job is flagged to not be monitored, skip it */ if (ORTE_JOB_CONTROL_DO_NOT_MONITOR & job->controls) { continue; diff --git a/orte/mca/plm/slurm/plm_slurm_module.c b/orte/mca/plm/slurm/plm_slurm_module.c index e9fc080e39..4593858bdb 100644 --- a/orte/mca/plm/slurm/plm_slurm_module.c +++ b/orte/mca/plm/slurm/plm_slurm_module.c @@ -505,7 +505,6 @@ static int plm_slurm_terminate_orteds(void) /* tell them to die without sending a reply - we will rely on the * waitpid to tell us when they have exited! */ - orte_orteds_openend_term_ordered = true; if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) { ORTE_ERROR_LOG(rc); } diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index befdce3074..d73a6883fc 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -96,7 +96,6 @@ bool orte_abnormal_term_ordered = false; bool orte_routing_is_enabled = false; bool orte_job_term_ordered = false; bool orte_orteds_term_ordered = false; -bool orte_orteds_openend_term_ordered = false; int orte_startup_timeout; diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index d517a96278..bdbbd12c22 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -604,7 +604,6 @@ ORTE_DECLSPEC extern bool orte_abnormal_term_ordered; ORTE_DECLSPEC extern bool orte_routing_is_enabled; ORTE_DECLSPEC extern bool orte_job_term_ordered; ORTE_DECLSPEC extern bool orte_orteds_term_ordered; -ORTE_DECLSPEC extern bool orte_orteds_openend_term_ordered; ORTE_DECLSPEC extern int orte_startup_timeout;