Change the default to "abort" the job when any process exits with a non-zero status. Add the required code to ensure the orted tells the HNP about the problem.
This commit was SVN r26270.
Этот коммит содержится в:
родитель
81d7fcaf82
Коммит
ddfbde587f
@ -348,6 +348,50 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
/* treat this as normal termination */
|
||||
goto REPORT_STATE;
|
||||
}
|
||||
/* report this as abnormal termination to the HNP */
|
||||
alert = OBJ_NEW(opal_buffer_t);
|
||||
/* pack update state command */
|
||||
cmd = ORTE_PLM_UPDATE_PROC_STATE;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return;
|
||||
}
|
||||
/* pack only the data for this proc - have to start with the jobid
|
||||
* so the receiver can unpack it correctly
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return;
|
||||
}
|
||||
|
||||
child->state = state;
|
||||
/* now pack the child's info */
|
||||
if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return;
|
||||
}
|
||||
/* remove the child from our local array as it is no longer alive */
|
||||
opal_pointer_array_set_item(orte_local_children, i, NULL);
|
||||
/* Decrement the number of local procs */
|
||||
jdata->num_local_procs--;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr:default_orted reporting proc %s abnormally terminated with non-zero status (local procs = %d)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&child->name),
|
||||
jdata->num_local_procs));
|
||||
|
||||
/* release the child object */
|
||||
OBJ_RELEASE(child);
|
||||
|
||||
/* send it */
|
||||
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
|
||||
ORTE_RML_TAG_PLM, 0,
|
||||
orte_rml_send_callback, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(alert);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (ORTE_PROC_STATE_FAILED_TO_START == state ||
|
||||
|
@ -496,7 +496,7 @@ int orte_register_params(void)
|
||||
|
||||
mca_base_param_reg_int_name("orte", "abort_on_non_zero_status",
|
||||
"Abort the job if any process returns a non-zero exit status - no restart in such cases",
|
||||
false, false, (int)false, &value);
|
||||
false, false, (int)true, &value);
|
||||
orte_abort_non_zero_exit = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
mca_base_param_reg_int_name("orte", "report_child_jobs_separately",
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user