Fix abnormal shutdown when a node dies
Этот коммит содержится в:
родитель
9da29c3621
Коммит
bc7a7f3de5
@ -59,3 +59,14 @@ of factors, including an inability to create a connection back
|
||||
to mpirun due to a lack of common network interfaces and/or no
|
||||
route found between them. Please check network connectivity
|
||||
(including firewalls and network routing requirements).
|
||||
#
|
||||
[node-died]
|
||||
ORTE has lost communication with its daemon located on node:
|
||||
|
||||
hostname: %s
|
||||
|
||||
This is usually due to either a failure of the TCP network
|
||||
connection to the node, or possibly an internal failure of
|
||||
the daemon itself. We cannot recover from this failure, and
|
||||
therefore will terminate the job.
|
||||
|
||||
|
@ -361,12 +361,16 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
|
||||
/* record the first one to fail */
|
||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
||||
/* output an error message so the user knows what happened */
|
||||
orte_show_help("help-errmgr-base.txt", "node-died", true, pptr->node->name);
|
||||
/* mark the daemon job as failed */
|
||||
jdata->state = ORTE_JOB_STATE_COMM_FAILED;
|
||||
/* point to the lowest rank to cause the problem */
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
|
||||
/* retain the object so it doesn't get free'd */
|
||||
OBJ_RETAIN(pptr);
|
||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
||||
/* update our exit code */
|
||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
||||
}
|
||||
/* abort the system */
|
||||
|
@ -300,31 +300,81 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
"%s errmgr:default:orted daemon %s exited",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc)));
|
||||
/* are any of my children still alive */
|
||||
for (i=0; i < orte_local_children->size; i++) {
|
||||
if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
|
||||
if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
|
||||
"%s errmgr:default:orted[%s(%d)] proc %s is alive",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
__FILE__, __LINE__,
|
||||
ORTE_NAME_PRINT(&child->name)));
|
||||
goto cleanup;
|
||||
/* if we are using static ports, then it is possible that the HNP
|
||||
* will not see this termination. So if the HNP didn't order us
|
||||
* to terminate, then we should ensure it knows */
|
||||
if (orte_static_ports && !orte_orteds_term_ordered) {
|
||||
/* send an alert to the HNP */
|
||||
alert = OBJ_NEW(opal_buffer_t);
|
||||
/* pack update state command */
|
||||
cmd = ORTE_PLM_UPDATE_PROC_STATE;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return;
|
||||
}
|
||||
/* get the proc_t */
|
||||
if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
goto cleanup;
|
||||
}
|
||||
/* pack only the data for this daemon - have to start with the jobid
|
||||
* so the receiver can unpack it correctly
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return;
|
||||
}
|
||||
|
||||
/* now pack the daemon's info */
|
||||
if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return;
|
||||
}
|
||||
/* send it */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:default_orted reporting lost connection to daemon %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc)));
|
||||
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
|
||||
ORTE_RML_TAG_PLM,
|
||||
orte_rml_send_callback, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(alert);
|
||||
}
|
||||
/* mark that we notified the HNP for this job so we don't do it again */
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
|
||||
/* continue on */
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (orte_orteds_term_ordered) {
|
||||
/* are any of my children still alive */
|
||||
for (i=0; i < orte_local_children->size; i++) {
|
||||
if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
|
||||
if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
|
||||
"%s errmgr:default:orted[%s(%d)] proc %s is alive",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
__FILE__, __LINE__,
|
||||
ORTE_NAME_PRINT(&child->name)));
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
/* if all my routes and children are gone, then terminate
|
||||
ourselves nicely (i.e., this is a normal termination) */
|
||||
if (0 == orte_routed.num_routes()) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:default:orted all routes gone - exiting",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||
} else {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:default:orted not exiting, num_routes() == %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(int)orte_routed.num_routes()));
|
||||
/* if all my routes and children are gone, then terminate
|
||||
ourselves nicely (i.e., this is a normal termination) */
|
||||
if (0 == orte_routed.num_routes()) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:default:orted all routes gone - exiting",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||
} else {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:default:orted not exiting, num_routes() == %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(int)orte_routed.num_routes()));
|
||||
}
|
||||
}
|
||||
/* if not, then we can continue */
|
||||
goto cleanup;
|
||||
|
@ -431,10 +431,6 @@ void mca_oob_tcp_recv_handler(int sd, short flags, void *cbdata)
|
||||
bool timing_same_as_hdr = false;
|
||||
#endif
|
||||
|
||||
if (orte_abnormal_term_ordered) {
|
||||
return;
|
||||
}
|
||||
|
||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||
"%s:tcp:recv:handler called for peer %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user