1
1

Fix abnormal shutdown when a node dies

Этот коммит содержится в:
Ralph Castain 2015-05-22 17:29:06 -07:00
родитель 9da29c3621
Коммит bc7a7f3de5
4 изменённых файлов: 88 добавлений и 27 удалений

Просмотреть файл

@@ -59,3 +59,14 @@ of factors, including an inability to create a connection back
to mpirun due to a lack of common network interfaces and/or no
route found between them. Please check network connectivity
(including firewalls and network routing requirements).
#
[node-died]
ORTE has lost communication with its daemon located on node:
hostname: %s
This is usually due to either a failure of the TCP network
connection to the node, or possibly an internal failure of
the daemon itself. We cannot recover from this failure, and
therefore will terminate the job.

Просмотреть файл

@@ -361,12 +361,16 @@ static void proc_errors(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
/* record the first one to fail */
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
/* output an error message so the user knows what happened */
orte_show_help("help-errmgr-base.txt", "node-died", true, pptr->node->name);
/* mark the daemon job as failed */
jdata->state = ORTE_JOB_STATE_COMM_FAILED;
/* point to the lowest rank to cause the problem */
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
/* retain the object so it doesn't get free'd */
OBJ_RETAIN(pptr);
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
/* update our exit code */
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
}
/* abort the system */

Просмотреть файл

@@ -300,31 +300,81 @@ static void proc_errors(int fd, short args, void *cbdata)
"%s errmgr:default:orted daemon %s exited",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc)));
/* are any of my children still alive */
for (i=0; i < orte_local_children->size; i++) {
if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
"%s errmgr:default:orted[%s(%d)] proc %s is alive",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
__FILE__, __LINE__,
ORTE_NAME_PRINT(&child->name)));
goto cleanup;
/* if we are using static ports, then it is possible that the HNP
* will not see this termination. So if the HNP didn't order us
* to terminate, then we should ensure it knows */
if (orte_static_ports && !orte_orteds_term_ordered) {
/* send an alert to the HNP */
alert = OBJ_NEW(opal_buffer_t);
/* pack update state command */
cmd = ORTE_PLM_UPDATE_PROC_STATE;
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
ORTE_ERROR_LOG(rc);
return;
}
/* get the proc_t */
if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
goto cleanup;
}
/* pack only the data for this daemon - have to start with the jobid
* so the receiver can unpack it correctly
*/
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
return;
}
/* now pack the daemon's info */
if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
ORTE_ERROR_LOG(rc);
return;
}
/* send it */
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s errmgr:default_orted reporting lost connection to daemon %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc)));
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
ORTE_RML_TAG_PLM,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(alert);
}
/* mark that we notified the HNP for this job so we don't do it again */
orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
/* continue on */
goto cleanup;
}
if (orte_orteds_term_ordered) {
/* are any of my children still alive */
for (i=0; i < orte_local_children->size; i++) {
if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
"%s errmgr:default:orted[%s(%d)] proc %s is alive",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
__FILE__, __LINE__,
ORTE_NAME_PRINT(&child->name)));
goto cleanup;
}
}
}
}
/* if all my routes and children are gone, then terminate
ourselves nicely (i.e., this is a normal termination) */
if (0 == orte_routed.num_routes()) {
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
"%s errmgr:default:orted all routes gone - exiting",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
} else {
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
"%s errmgr:default:orted not exiting, num_routes() == %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(int)orte_routed.num_routes()));
/* if all my routes and children are gone, then terminate
ourselves nicely (i.e., this is a normal termination) */
if (0 == orte_routed.num_routes()) {
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
"%s errmgr:default:orted all routes gone - exiting",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
} else {
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
"%s errmgr:default:orted not exiting, num_routes() == %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(int)orte_routed.num_routes()));
}
}
/* if not, then we can continue */
goto cleanup;

Просмотреть файл

@@ -431,10 +431,6 @@ void mca_oob_tcp_recv_handler(int sd, short flags, void *cbdata)
bool timing_same_as_hdr = false;
#endif
if (orte_abnormal_term_ordered) {
return;
}
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s:tcp:recv:handler called for peer %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),