From bc7a7f3de557ee568d782288efce6b9cf2b7e46e Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Fri, 22 May 2015 17:29:06 -0700 Subject: [PATCH] Fix abnormal shutdown when a node dies --- orte/mca/errmgr/base/help-errmgr-base.txt | 11 +++ .../errmgr/default_hnp/errmgr_default_hnp.c | 4 + .../default_orted/errmgr_default_orted.c | 96 ++++++++++++++----- orte/mca/oob/tcp/oob_tcp_sendrecv.c | 4 - 4 files changed, 88 insertions(+), 27 deletions(-) diff --git a/orte/mca/errmgr/base/help-errmgr-base.txt b/orte/mca/errmgr/base/help-errmgr-base.txt index bdf0c40b9d..cef3e78c07 100644 --- a/orte/mca/errmgr/base/help-errmgr-base.txt +++ b/orte/mca/errmgr/base/help-errmgr-base.txt @@ -59,3 +59,14 @@ of factors, including an inability to create a connection back to mpirun due to a lack of common network interfaces and/or no route found between them. Please check network connectivity (including firewalls and network routing requirements). +# +[node-died] +ORTE has lost communication with its daemon located on node: + + hostname: %s + +This is usually due to either a failure of the TCP network +connection to the node, or possibly an internal failure of +the daemon itself. We cannot recover from this failure, and +therefore will terminate the job. + diff --git a/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c b/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c index a48fcbdaa5..9b2525eec7 100644 --- a/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c +++ b/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c @@ -361,12 +361,16 @@ static void proc_errors(int fd, short args, void *cbdata) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* record the first one to fail */ if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { + /* output an error message so the user knows what happened */ + orte_show_help("help-errmgr-base.txt", "node-died", true, pptr->node->name); + /* mark the daemon job as failed */ jdata->state = ORTE_JOB_STATE_COMM_FAILED; /* point to the lowest rank to cause the problem */ orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR); /* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED); + /* update our exit code */ ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); } /* abort the system */ diff --git a/orte/mca/errmgr/default_orted/errmgr_default_orted.c b/orte/mca/errmgr/default_orted/errmgr_default_orted.c index 7495e80db5..1da802588c 100644 --- a/orte/mca/errmgr/default_orted/errmgr_default_orted.c +++ b/orte/mca/errmgr/default_orted/errmgr_default_orted.c @@ -300,31 +300,81 @@ static void proc_errors(int fd, short args, void *cbdata) "%s errmgr:default:orted daemon %s exited", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); - /* are any of my children still alive */ - for (i=0; i < orte_local_children->size; i++) { - if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { - if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) { - OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, - "%s errmgr:default:orted[%s(%d)] proc %s is alive", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - __FILE__, __LINE__, - ORTE_NAME_PRINT(&child->name))); - goto cleanup; + /* if we are using static ports, then it is possible that the HNP + * will not see this termination. So if the HNP didn't order us + * to terminate, then we should ensure it knows */ + if (orte_static_ports && !orte_orteds_term_ordered) { + /* send an alert to the HNP */ + alert = OBJ_NEW(opal_buffer_t); + /* pack update state command */ + cmd = ORTE_PLM_UPDATE_PROC_STATE; + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { + ORTE_ERROR_LOG(rc); + return; + } + /* get the proc_t */ + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + goto cleanup; + } + /* pack only the data for this daemon - have to start with the jobid + * so the receiver can unpack it correctly + */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + return; + } + + /* now pack the daemon's info */ + if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { + ORTE_ERROR_LOG(rc); + return; + } + /* send it */ + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, + "%s errmgr:default_orted reporting lost connection to daemon %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); + if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, + ORTE_RML_TAG_PLM, + orte_rml_send_callback, NULL))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(alert); + } + /* mark that we notified the HNP for this job so we don't do it again */ + orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); + /* continue on */ + goto cleanup; + } + + if (orte_orteds_term_ordered) { + /* are any of my children still alive */ + for (i=0; i < orte_local_children->size; i++) { + if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) { + OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, + "%s errmgr:default:orted[%s(%d)] proc %s is alive", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + __FILE__, __LINE__, + ORTE_NAME_PRINT(&child->name))); + goto cleanup; + } } } - } - /* if all my routes and children are gone, then terminate - ourselves nicely (i.e., this is a normal termination) */ - if (0 == orte_routed.num_routes()) { - OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, - "%s errmgr:default:orted all routes gone - exiting", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); - } else { - OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, - "%s errmgr:default:orted not exiting, num_routes() == %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (int)orte_routed.num_routes())); + /* if all my routes and children are gone, then terminate + ourselves nicely (i.e., this is a normal termination) */ + if (0 == orte_routed.num_routes()) { + OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, + "%s errmgr:default:orted all routes gone - exiting", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); + } else { + OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, + "%s errmgr:default:orted not exiting, num_routes() == %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (int)orte_routed.num_routes())); + } } /* if not, then we can continue */ goto cleanup; diff --git a/orte/mca/oob/tcp/oob_tcp_sendrecv.c b/orte/mca/oob/tcp/oob_tcp_sendrecv.c index f75827a7f3..35e72a702e 100644 --- a/orte/mca/oob/tcp/oob_tcp_sendrecv.c +++ b/orte/mca/oob/tcp/oob_tcp_sendrecv.c @@ -431,10 +431,6 @@ void mca_oob_tcp_recv_handler(int sd, short flags, void *cbdata) bool timing_same_as_hdr = false; #endif - if (orte_abnormal_term_ordered) { - return; - } - opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s:tcp:recv:handler called for peer %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),