Fix a problem where we need to abort due to a mapping failure, but we are in a managed environment and thus the orteds have not wired up. Thus, if we send the exit message across the routed network, the remote daemons won't have a way to relay the message along - and we won't exit.
If we are aborting, then set the flags so the HNP directly sends an exit command to each daemon. Make it the halt_vm command so the remote daemon doesn't try to relay it, but instead just exits without waiting for its routed children to exit first. cmr=v1.8.1:reviewer=jsquyres:subject=fix hangs due to abort prior to daemon wireup. This commit was SVN r31304.
This commit is contained in:
parent
f133b6b693
Commit
3fdcaeab97
@ -161,8 +161,16 @@ static void job_errors(int fd, short args, void *cbdata)
|
||||
orte_job_state_to_str(jobstate)));
|
||||
|
||||
if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate ||
|
||||
ORTE_JOB_STATE_ALLOC_FAILED == jobstate) {
|
||||
ORTE_JOB_STATE_ALLOC_FAILED == jobstate ||
|
||||
ORTE_JOB_STATE_MAP_FAILED == jobstate ||
|
||||
ORTE_JOB_STATE_CANNOT_LAUNCH == jobstate) {
|
||||
orte_never_launched = true;
|
||||
/* disable routing as we may not have performed the daemon
|
||||
* wireup - e.g., in a managed environment, all the daemons
|
||||
* "phone home", but don't actually wireup into the routed
|
||||
* network until they receive the launch message
|
||||
*/
|
||||
orte_routing_is_enabled = false;
|
||||
jdata->num_terminated = jdata->num_procs;
|
||||
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_TERMINATED);
|
||||
OBJ_RELEASE(caddy);
|
||||
|
@ -73,18 +73,31 @@ int orte_plm_base_orted_exit(orte_daemon_cmd_flag_t command)
|
||||
{
|
||||
int rc;
|
||||
opal_buffer_t *cmd;
|
||||
|
||||
orte_daemon_cmd_flag_t cmmnd;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
||||
"%s plm:base:orted_cmd sending orted_exit commands",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* flag that orteds are being terminated */
|
||||
orte_orteds_term_ordered = true;
|
||||
|
||||
cmmnd = command;
|
||||
|
||||
/* if we are terminating before launch, or abnormally
|
||||
* terminating, then the daemons may not be wired up
|
||||
* and therefore cannot depend on detecting their
|
||||
* routed children to determine termination
|
||||
*/
|
||||
if (orte_abnormal_term_ordered ||
|
||||
orte_never_launched ||
|
||||
!orte_routing_is_enabled) {
|
||||
cmmnd = ORTE_DAEMON_HALT_VM_CMD;
|
||||
}
|
||||
|
||||
/* send it express delivery! */
|
||||
cmd = OBJ_NEW(opal_buffer_t);
|
||||
/* pack the command */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(cmd, &command, 1, ORTE_DAEMON_CMD))) {
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(cmd, &cmmnd, 1, ORTE_DAEMON_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(cmd);
|
||||
return rc;
|
||||
|
@ -172,11 +172,13 @@ typedef int32_t orte_job_state_t;
|
||||
|
||||
#define ORTE_JOB_STATE_REPORT_PROGRESS (ORTE_JOB_STATE_ERROR + 17) /* report launch progress - not an error */
|
||||
#define ORTE_JOB_STATE_ALLOC_FAILED (ORTE_JOB_STATE_ERROR + 18) /* job failed to obtain an allocation */
|
||||
#define ORTE_JOB_STATE_MAP_FAILED (ORTE_JOB_STATE_ERROR + 19) /* job failed to map */
|
||||
#define ORTE_JOB_STATE_CANNOT_LAUNCH (ORTE_JOB_STATE_ERROR + 20) /* resources were busy and so the job cannot be launched */
|
||||
|
||||
/* define an FT event */
|
||||
#define ORTE_JOB_STATE_FT_CHECKPOINT (ORTE_JOB_STATE_ERROR + 19)
|
||||
#define ORTE_JOB_STATE_FT_CONTINUE (ORTE_JOB_STATE_ERROR + 20)
|
||||
#define ORTE_JOB_STATE_FT_RESTART (ORTE_JOB_STATE_ERROR + 21)
|
||||
#define ORTE_JOB_STATE_FT_CHECKPOINT (ORTE_JOB_STATE_ERROR + 21)
|
||||
#define ORTE_JOB_STATE_FT_CONTINUE (ORTE_JOB_STATE_ERROR + 22)
|
||||
#define ORTE_JOB_STATE_FT_RESTART (ORTE_JOB_STATE_ERROR + 23)
|
||||
|
||||
|
||||
/* Define a boundary so that external developers
|
||||
|
@ -287,3 +287,9 @@ impact performance when combined with the requested binding
|
||||
operation. We will continue, but will not bind the processes.
|
||||
This warning can be omitted by adding the "overload-allowed"
|
||||
qualifier to the binding policy.
|
||||
#
|
||||
[cannot-launch]
|
||||
Although we were able to map your job, we are unable to launch
|
||||
it at this time due to required resources being busy. Please
|
||||
try again later.
|
||||
|
||||
|
@ -283,7 +283,7 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
*/
|
||||
if (ORTE_ERR_TAKE_NEXT_OPTION != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
@ -292,6 +292,8 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
/* the map was done but nothing could be mapped
|
||||
* for launch as all the resources were busy
|
||||
*/
|
||||
orte_show_help("help-orte-rmaps-base.txt", "cannot-launch", true);
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_CANNOT_LAUNCH);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
@ -301,7 +303,7 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
*/
|
||||
if (!did_map || 0 == jdata->num_procs) {
|
||||
orte_show_help("help-orte-rmaps-base.txt", "failed-map", true);
|
||||
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
@ -309,7 +311,7 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
/* compute and save local ranks */
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
@ -318,7 +320,7 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
/* compute and save bindings */
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
|
@ -50,7 +50,7 @@ void orte_routed_base_xcast_routing(orte_grpcomm_collective_t *coll,
|
||||
* then send it directly to everyone
|
||||
*/
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
if (orte_abnormal_term_ordered) {
|
||||
if (orte_abnormal_term_ordered || !orte_routing_is_enabled) {
|
||||
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
||||
for (i=1; i < daemons->procs->size; i++) {
|
||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, i))) {
|
||||
|
@ -456,6 +456,38 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
|
||||
return;
|
||||
break;
|
||||
|
||||
case ORTE_DAEMON_HALT_VM_CMD:
|
||||
if (orte_debug_daemons_flag) {
|
||||
opal_output(0, "%s orted_cmd: received halt_vm cmd",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
}
|
||||
/* kill the local procs */
|
||||
orte_odls.kill_local_procs(NULL);
|
||||
/* flag that orteds were ordered to terminate */
|
||||
orte_orteds_term_ordered = true;
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
/* if all my routes and local children are gone, then terminate ourselves */
|
||||
if (0 == orte_routed.num_routes()) {
|
||||
for (i=0; i < orte_local_children->size; i++) {
|
||||
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
|
||||
proct->alive) {
|
||||
/* at least one is still alive */
|
||||
return;
|
||||
}
|
||||
}
|
||||
/* call our appropriate exit procedure */
|
||||
if (orte_debug_daemons_flag) {
|
||||
opal_output(0, "%s orted_cmd: all routes and children gone - exiting",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
}
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||
}
|
||||
} else {
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||
}
|
||||
return;
|
||||
break;
|
||||
|
||||
/**** SPAWN JOB COMMAND ****/
|
||||
case ORTE_DAEMON_SPAWN_JOB_CMD:
|
||||
if (orte_debug_daemons_flag) {
|
||||
|
@ -12,6 +12,7 @@
|
||||
* Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -298,6 +299,10 @@ const char *orte_job_state_to_str(orte_job_state_t state)
|
||||
return "REPORT PROGRESS";
|
||||
case ORTE_JOB_STATE_ALLOC_FAILED:
|
||||
return "ALLOCATION FAILED";
|
||||
case ORTE_JOB_STATE_MAP_FAILED:
|
||||
return "MAP FAILED";
|
||||
case ORTE_JOB_STATE_CANNOT_LAUNCH:
|
||||
return "CANNOT LAUNCH";
|
||||
case ORTE_JOB_STATE_FT_CHECKPOINT:
|
||||
return "FAULT TOLERANCE CHECKPOINT";
|
||||
case ORTE_JOB_STATE_FT_CONTINUE:
|
||||
|
Loading…
x
Reference in new issue
Block a user