From ddfbde587fcc886eb8e044c4fab7d22f10f93f34 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Fri, 13 Apr 2012 21:19:46 +0000 Subject: [PATCH] Change the default to "abort" the job when any process exits with a non-zero status. Add the required code to ensure the orted tells the HNP about the problem. This commit was SVN r26270. --- .../default_orted/errmgr_default_orted.c | 44 +++++++++++++++++++ orte/runtime/orte_mca_params.c | 2 +- 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/orte/mca/errmgr/default_orted/errmgr_default_orted.c b/orte/mca/errmgr/default_orted/errmgr_default_orted.c index 2ed26e8b73..5202e29596 100644 --- a/orte/mca/errmgr/default_orted/errmgr_default_orted.c +++ b/orte/mca/errmgr/default_orted/errmgr_default_orted.c @@ -348,6 +348,50 @@ static void proc_errors(int fd, short args, void *cbdata) /* treat this as normal termination */ goto REPORT_STATE; } + /* report this as abnormal termination to the HNP */ + alert = OBJ_NEW(opal_buffer_t); + /* pack update state command */ + cmd = ORTE_PLM_UPDATE_PROC_STATE; + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { + ORTE_ERROR_LOG(rc); + return; + } + /* pack only the data for this proc - have to start with the jobid + * so the receiver can unpack it correctly + */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + return; + } + + child->state = state; + /* now pack the child's info */ + if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { + ORTE_ERROR_LOG(rc); + return; + } + /* remove the child from our local array as it is no longer alive */ + opal_pointer_array_set_item(orte_local_children, i, NULL); + /* Decrement the number of local procs */ + jdata->num_local_procs--; + + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:default_orted reporting proc %s abnormally terminated with non-zero status (local procs = %d)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&child->name), + jdata->num_local_procs)); + + /* release the child object */ + OBJ_RELEASE(child); + + /* send it */ + if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, + ORTE_RML_TAG_PLM, 0, + orte_rml_send_callback, NULL))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(alert); + } + return; } if (ORTE_PROC_STATE_FAILED_TO_START == state || diff --git a/orte/runtime/orte_mca_params.c b/orte/runtime/orte_mca_params.c index 15aea91141..71cda1504a 100644 --- a/orte/runtime/orte_mca_params.c +++ b/orte/runtime/orte_mca_params.c @@ -496,7 +496,7 @@ int orte_register_params(void) mca_base_param_reg_int_name("orte", "abort_on_non_zero_status", "Abort the job if any process returns a non-zero exit status - no restart in such cases", - false, false, (int)false, &value); + false, false, (int)true, &value); orte_abort_non_zero_exit = OPAL_INT_TO_BOOL(value); mca_base_param_reg_int_name("orte", "report_child_jobs_separately",