1
1

Fix a hang that occurs when we need to abort due to a mapping failure while running in a managed environment, where the orteds have not yet wired up. In that situation, if we send the exit message across the routed network, the remote daemons have no way to relay the message along - and so we never exit.

If we are aborting, set the flags so that the HNP sends an exit command directly to each daemon. Send it as the halt_vm command so that the remote daemon doesn't try to relay it, but instead just exits without first waiting for its routed children to exit.

cmr=v1.8.1:reviewer=jsquyres:subject=fix hangs due to abort prior to daemon wireup

This commit was SVN r31304.
Этот коммит содержится в:
Ralph Castain 2014-04-02 04:17:55 +00:00
родитель f133b6b693
Коммит 3fdcaeab97
8 изменённых файлов: 80 добавлений и 12 удалений

Просмотреть файл

@ -161,8 +161,16 @@ static void job_errors(int fd, short args, void *cbdata)
orte_job_state_to_str(jobstate)));
if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate ||
ORTE_JOB_STATE_ALLOC_FAILED == jobstate) {
ORTE_JOB_STATE_ALLOC_FAILED == jobstate ||
ORTE_JOB_STATE_MAP_FAILED == jobstate ||
ORTE_JOB_STATE_CANNOT_LAUNCH == jobstate) {
orte_never_launched = true;
/* disable routing as we may not have performed the daemon
* wireup - e.g., in a managed environment, all the daemons
* "phone home", but don't actually wireup into the routed
* network until they receive the launch message
*/
orte_routing_is_enabled = false;
jdata->num_terminated = jdata->num_procs;
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_TERMINATED);
OBJ_RELEASE(caddy);

Просмотреть файл

@ -73,18 +73,31 @@ int orte_plm_base_orted_exit(orte_daemon_cmd_flag_t command)
{
int rc;
opal_buffer_t *cmd;
orte_daemon_cmd_flag_t cmmnd;
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
"%s plm:base:orted_cmd sending orted_exit commands",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* flag that orteds are being terminated */
orte_orteds_term_ordered = true;
cmmnd = command;
/* if we are terminating before launch, or abnormally
* terminating, then the daemons may not be wired up
* and therefore cannot depend on detecting their
* routed children to determine termination
*/
if (orte_abnormal_term_ordered ||
orte_never_launched ||
!orte_routing_is_enabled) {
cmmnd = ORTE_DAEMON_HALT_VM_CMD;
}
/* send it express delivery! */
cmd = OBJ_NEW(opal_buffer_t);
/* pack the command */
if (ORTE_SUCCESS != (rc = opal_dss.pack(cmd, &command, 1, ORTE_DAEMON_CMD))) {
if (ORTE_SUCCESS != (rc = opal_dss.pack(cmd, &cmmnd, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(cmd);
return rc;

Просмотреть файл

@ -172,11 +172,13 @@ typedef int32_t orte_job_state_t;
#define ORTE_JOB_STATE_REPORT_PROGRESS (ORTE_JOB_STATE_ERROR + 17) /* report launch progress - not an error */
#define ORTE_JOB_STATE_ALLOC_FAILED (ORTE_JOB_STATE_ERROR + 18) /* job failed to obtain an allocation */
#define ORTE_JOB_STATE_MAP_FAILED (ORTE_JOB_STATE_ERROR + 19) /* job failed to map */
#define ORTE_JOB_STATE_CANNOT_LAUNCH (ORTE_JOB_STATE_ERROR + 20) /* resources were busy and so the job cannot be launched */
/* define an FT event */
#define ORTE_JOB_STATE_FT_CHECKPOINT (ORTE_JOB_STATE_ERROR + 19)
#define ORTE_JOB_STATE_FT_CONTINUE (ORTE_JOB_STATE_ERROR + 20)
#define ORTE_JOB_STATE_FT_RESTART (ORTE_JOB_STATE_ERROR + 21)
#define ORTE_JOB_STATE_FT_CHECKPOINT (ORTE_JOB_STATE_ERROR + 21)
#define ORTE_JOB_STATE_FT_CONTINUE (ORTE_JOB_STATE_ERROR + 22)
#define ORTE_JOB_STATE_FT_RESTART (ORTE_JOB_STATE_ERROR + 23)
/* Define a boundary so that external developers

Просмотреть файл

@ -287,3 +287,9 @@ impact performance when combined with the requested binding
operation. We will continue, but will not bind the processes.
This warning can be omitted by adding the "overload-allowed"
qualifier to the binding policy.
#
[cannot-launch]
Although we were able to map your job, we are unable to launch
it at this time due to required resources being busy. Please
try again later.

Просмотреть файл

@ -283,7 +283,7 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
*/
if (ORTE_ERR_TAKE_NEXT_OPTION != rc) {
ORTE_ERROR_LOG(rc);
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
OBJ_RELEASE(caddy);
return;
}
@ -292,6 +292,8 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
/* the map was done but nothing could be mapped
* for launch as all the resources were busy
*/
orte_show_help("help-orte-rmaps-base.txt", "cannot-launch", true);
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_CANNOT_LAUNCH);
OBJ_RELEASE(caddy);
return;
}
@ -301,7 +303,7 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
*/
if (!did_map || 0 == jdata->num_procs) {
orte_show_help("help-orte-rmaps-base.txt", "failed-map", true);
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
OBJ_RELEASE(caddy);
return;
}
@ -309,7 +311,7 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
/* compute and save local ranks */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
ORTE_ERROR_LOG(rc);
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
OBJ_RELEASE(caddy);
return;
}
@ -318,7 +320,7 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
/* compute and save bindings */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) {
ORTE_ERROR_LOG(rc);
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
OBJ_RELEASE(caddy);
return;
}

Просмотреть файл

@ -50,7 +50,7 @@ void orte_routed_base_xcast_routing(orte_grpcomm_collective_t *coll,
* then send it directly to everyone
*/
if (ORTE_PROC_IS_HNP) {
if (orte_abnormal_term_ordered) {
if (orte_abnormal_term_ordered || !orte_routing_is_enabled) {
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
for (i=1; i < daemons->procs->size; i++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, i))) {

Просмотреть файл

@ -456,6 +456,38 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
return;
break;
case ORTE_DAEMON_HALT_VM_CMD:
if (orte_debug_daemons_flag) {
opal_output(0, "%s orted_cmd: received halt_vm cmd",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
}
/* kill the local procs */
orte_odls.kill_local_procs(NULL);
/* flag that orteds were ordered to terminate */
orte_orteds_term_ordered = true;
if (ORTE_PROC_IS_HNP) {
/* if all my routes and local children are gone, then terminate ourselves */
if (0 == orte_routed.num_routes()) {
for (i=0; i < orte_local_children->size; i++) {
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
proct->alive) {
/* at least one is still alive */
return;
}
}
/* call our appropriate exit procedure */
if (orte_debug_daemons_flag) {
opal_output(0, "%s orted_cmd: all routes and children gone - exiting",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
}
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
}
} else {
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
}
return;
break;
/**** SPAWN JOB COMMAND ****/
case ORTE_DAEMON_SPAWN_JOB_CMD:
if (orte_debug_daemons_flag) {

Просмотреть файл

@ -12,6 +12,7 @@
* Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -298,6 +299,10 @@ const char *orte_job_state_to_str(orte_job_state_t state)
return "REPORT PROGRESS";
case ORTE_JOB_STATE_ALLOC_FAILED:
return "ALLOCATION FAILED";
case ORTE_JOB_STATE_MAP_FAILED:
return "MAP FAILED";
case ORTE_JOB_STATE_CANNOT_LAUNCH:
return "CANNOT LAUNCH";
case ORTE_JOB_STATE_FT_CHECKPOINT:
return "FAULT TOLERANCE CHECKPOINT";
case ORTE_JOB_STATE_FT_CONTINUE: