1
1

Upgrade the node/orted failure detection code to cover all environments. Use the native environment's capabilities where possible - e.g., SLURM detects orted failure and can report it. Elsewhere - e.g., for TM and rsh - use a heartbeat system to detect orted failure. The heart rate is set via an MCA param. The HNP checks the daemons every 2*heartrate seconds and declares an orted failed if no heartbeat has been seen from it within the last 2*heartrate seconds.

Also detect orted failed-to-start by setting timeout on launch. Currently only used in TM launcher.

Neither detection is enabled by default; each is active only if the heartrate and/or the launch timeout is set. The exception is SLURM, where orted failure is always detected and reported.

More info to come on devel list.

This commit was SVN r18555.
Этот коммит содержится в:
Ralph Castain 2008-06-02 21:46:34 +00:00
родитель 69d78c6739
Коммит b456fb2d42
21 изменённых файлов: 489 добавлений и 192 удалений

Просмотреть файл

@ -268,7 +268,7 @@ static int plm_alps_launch_job(orte_job_t *jdata)
orte_plm_base_orted_append_basic_args(&argc, &argv,
"alps",
&proc_vpid_index,
NULL);
NULL, false);
/* tell the new daemons the base of the name list so they can compute
* their own name on the other end

Просмотреть файл

@ -30,4 +30,6 @@ libmca_plm_la_SOURCES += \
base/plm_base_launch_support.c \
base/plm_base_jobid.c \
base/plm_base_proxy.c \
base/plm_base_orted_cmds.c
base/plm_base_orted_cmds.c \
base/plm_base_heartbeat.c

135
orte/mca/plm/base/plm_base_heartbeat.c Обычный файл
Просмотреть файл

@ -0,0 +1,135 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "orte_config.h"
#include "orte/constants.h"
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#include "opal/dss/dss.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/output.h"
#include "orte/runtime/orte_wakeup.h"
#include "orte/runtime/orte_wait.h"
#include "orte/util/name_fns.h"
#include "orte/mca/plm/base/plm_private.h"
/*
 * Timer callback that sends one heartbeat message to the HNP.
 *
 * Packs ORTE_PLM_HEARTBEAT_CMD into a buffer and sends it to
 * ORTE_PROC_MY_HNP on ORTE_RML_TAG_PLM. Only when both the pack and
 * the send succeed is the timer re-armed for another
 * orte_heartbeat_rate interval; on either failure the error is logged
 * and the timer is NOT re-armed, so no further heartbeats go out.
 *
 * The fd, event and data arguments are required by the callback
 * signature and are not used here.
 *
 * NOTE(review): the re-arm interval is the global orte_heartbeat_rate,
 * not a value handed to this callback - confirm that global holds the
 * intended rate in the process that runs this function.
 */
void orte_plm_base_heartbeat(int fd, short event, void *data)
{
    orte_plm_cmd_flag_t command = ORTE_PLM_HEARTBEAT_CMD;
    opal_buffer_t buf;
    int rc;

    OBJ_CONSTRUCT(&buf, opal_buffer_t);

    /* the message body is just the command flag identifying a heartbeat */
    rc = opal_dss.pack(&buf, &command, 1, ORTE_PLM_CMD);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    } else {
        /* a negative return from send_buffer is treated as failure */
        rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &buf, ORTE_RML_TAG_PLM, 0);
        if (rc < 0) {
            ORTE_ERROR_LOG(rc);
        } else {
            /* message went out - schedule the next beat */
            ORTE_TIMER_EVENT(orte_heartbeat_rate, orte_plm_base_heartbeat);
        }
    }

    /* the buffer is released on every path */
    OBJ_DESTRUCT(&buf);
}
/* multiplier applied to orte_heartbeat_rate, used both as the interval
 * between checks and as the maximum tolerated silence from a daemon
 */
#define HEARTBEAT_CK 2

/*
 * Timer callback that inspects the last recorded heartbeat of every
 * daemon except vpid 0.
 *
 * Any daemon whose "beat" field is more than
 * HEARTBEAT_CK*orte_heartbeat_rate seconds behind the current
 * wall-clock second is marked ORTE_PROC_STATE_ABORTED. All daemons are
 * examined before acting, so every stale daemon is flagged in one pass.
 * If at least one was flagged, orte_plm_base_launch_failed() is called
 * for the daemon job and the timer is not re-armed; otherwise the
 * timer is re-armed for another HEARTBEAT_CK*orte_heartbeat_rate.
 *
 * Nothing is done (and the timer is not re-armed) when an abnormal
 * termination has been ordered, a shutdown is in progress, or the
 * daemon job object cannot be found.
 *
 * NOTE(review): a daemon whose beat is still 0 is compared against the
 * current epoch second and will therefore be flagged - confirm that
 * every daemon is guaranteed to have reported before the first check.
 * NOTE(review): procs[v] is dereferenced without a NULL check -
 * presumably every slot below num_procs is populated; verify.
 */
static void check_heartbeat(int fd, short dummy, void *arg)
{
    orte_job_t *daemons;
    orte_proc_t **procs;
    orte_proc_t *daemon;
    struct timeval current;
    orte_vpid_t v;
    bool died = false;

    ORTE_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                         "%s plm:base:check_heartbeat",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* nothing to police once we are already going down */
    if (orte_abnormal_term_ordered || orte_shutdown_in_progress) {
        return;
    }

    /* the daemons live in the job that carries our own jobid */
    daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    if (NULL == daemons) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return;
    }
    procs = (orte_proc_t**)daemons->procs->addr;

    /* only the seconds field of the current time is used below */
    gettimeofday(&current, NULL);

    /* start at 1: vpid 0 is never examined */
    for (v = 1; v < daemons->num_procs; v++) {
        daemon = procs[v];

        /* heard from this one recently enough - move on */
        if ((current.tv_sec - daemon->beat) <= HEARTBEAT_CK*orte_heartbeat_rate) {
            continue;
        }

        /* too quiet for too long: record it as aborted */
        daemon->state = ORTE_PROC_STATE_ABORTED;
        daemon->exit_code = ORTE_ERROR_DEFAULT_EXIT_CODE;
        /* only the first stale daemon found is remembered as the culprit */
        if (NULL == daemons->aborted_proc) {
            daemons->aborted_proc = daemon;
        }
        ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
        died = true;
    }

    /* everyone is alive - look again after the next interval */
    if (!died) {
        ORTE_TIMER_EVENT(HEARTBEAT_CK*orte_heartbeat_rate, check_heartbeat);
        return;
    }

    /* at least one daemon is gone - report the failure for the daemon job */
    orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, false, -1,
                                ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_ABORTED);
}
/*
 * Arm the periodic daemon health check.
 *
 * Does nothing unless orte_heartbeat_rate is positive; otherwise
 * schedules check_heartbeat to fire after
 * HEARTBEAT_CK*orte_heartbeat_rate.
 */
void orte_plm_base_start_heart(void)
{
    /* a non-positive rate means heartbeat monitoring is switched off */
    if (orte_heartbeat_rate <= 0) {
        return;
    }

    ORTE_TIMER_EVENT(HEARTBEAT_CK*orte_heartbeat_rate, check_heartbeat);
}

Просмотреть файл

@ -45,6 +45,7 @@
#include "orte/runtime/orte_wakeup.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_locks.h"
#include "orte/runtime/orte_wait.h"
#include "orte/util/name_fns.h"
@ -229,47 +230,60 @@ void orte_plm_base_launch_failed(orte_jobid_t job, bool daemons_launching, pid_t
orte_job_t *jdata;
char *pidstr;
if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */
ORTE_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:base:launch_failed abort in progress, ignoring report",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
return;
}
ORTE_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:launch_failed for job %s during %s",
"%s plm:base:launch_failed for job %s %s daemon launch",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(job),
(daemons_launching) ? "daemon launch" : "app launch"));
(daemons_launching) ? "during" : "after"));
if (0 < pid) {
asprintf(&pidstr, "%d", (int)pid);
} else {
/* if the pid is negative, then we couldn't get a real pid
* to report here - so tell someone that
*/
pidstr = strdup("unknown");
}
if (daemons_launching) {
if (WIFSIGNALED(status)) { /* died on signal */
#ifdef WCOREDUMP
if (WCOREDUMP(status)) {
orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true,
pidstr, WTERMSIG(status));
} else {
orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
pidstr, WTERMSIG(status));
}
#else
orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
pidstr, WTERMSIG(status));
#endif /* WCOREDUMP */
} else {
orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true,
pidstr, WEXITSTATUS(status));
}
orted_failed_launch = true;
/* if this is the daemon job that failed, set the flag indicating
* that a daemon failed so we use the proper
* methods for attempting to shutdown the rest of the system
*/
if (ORTE_PROC_MY_NAME->jobid == job) {
/* set the flag indicating that a daemon failed so we use the proper
* methods for attempting to shutdown the rest of the system
*/
orte_abnormal_term_ordered = true;
}
free(pidstr);
if (0 < pid) {
asprintf(&pidstr, "%d", (int)pid);
} else {
/* if the pid is negative, then we couldn't get a real pid
* to report here - so tell someone that
*/
pidstr = strdup("unknown");
}
if (daemons_launching) {
if (WIFSIGNALED(status)) { /* died on signal */
#ifdef WCOREDUMP
if (WCOREDUMP(status)) {
orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true,
pidstr, WTERMSIG(status));
} else {
orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
pidstr, WTERMSIG(status));
}
#else
orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
pidstr, WTERMSIG(status));
#endif /* WCOREDUMP */
} else {
orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true,
pidstr, WEXITSTATUS(status));
}
orted_failed_launch = true;
}
free(pidstr);
}
/* Set the job state as indicated so orterun's exit status
will be non-zero
@ -669,7 +683,8 @@ static int orte_plm_base_report_launched(orte_jobid_t job)
int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
char *ess,
int *proc_vpid_index,
int *node_name_index)
int *node_name_index,
bool heartbeat)
{
char *param = NULL;
int loc_id;
@ -700,6 +715,19 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
opal_argv_append(argc, argv, param);
free(param);
}
if (0 < orted_debug_failure_delay) {
opal_argv_append(argc, argv, "--debug-failure-delay");
asprintf(&param, "%d", orted_debug_failure_delay);
opal_argv_append(argc, argv, param);
free(param);
}
if (heartbeat) {
/* tell the daemon to do a heartbeat */
opal_argv_append(argc, argv, "--heartbeat");
asprintf(&param, "%d", orte_heartbeat_rate);
opal_argv_append(argc, argv, param);
free(param);
}
/* tell the orted what SDS component to use */
opal_argv_append(argc, argv, "-mca");
@ -915,9 +943,14 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
"%s plm:base:check_job_completed declared job %s failed to start by proc %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid),
ORTE_NAME_PRINT(&(jdata->aborted_proc->name))));
(NULL == jdata->aborted_proc) ? "unknown" : ORTE_NAME_PRINT(&(jdata->aborted_proc->name))));
/* report this to the errmgr - it will protect us from multiple calls */
orte_errmgr.incomplete_start(jdata->jobid, jdata->aborted_proc->exit_code);
if (NULL == jdata->aborted_proc) {
/* we don't know who caused us to abort */
orte_errmgr.incomplete_start(jdata->jobid, ORTE_ERROR_DEFAULT_EXIT_CODE);
} else {
orte_errmgr.incomplete_start(jdata->jobid, jdata->aborted_proc->exit_code);
}
goto CHECK_ALL_JOBS;
} else if (ORTE_JOB_STATE_ABORTED == jdata->state ||
ORTE_JOB_STATE_ABORTED_BY_SIG == jdata->state ||
@ -926,10 +959,15 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
"%s plm:base:check_job_completed declared job %s aborted by proc %s with code %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid),
ORTE_NAME_PRINT(&(jdata->aborted_proc->name)),
jdata->aborted_proc->exit_code));
(NULL == jdata->aborted_proc) ? "unknown" : ORTE_NAME_PRINT(&(jdata->aborted_proc->name)),
(NULL == jdata->aborted_proc) ? ORTE_ERROR_DEFAULT_EXIT_CODE : jdata->aborted_proc->exit_code));
/* report this to the errmgr */
orte_errmgr.proc_aborted(&(jdata->aborted_proc->name), jdata->aborted_proc->exit_code);
if (NULL == jdata->aborted_proc) {
/* we don't know who caused us to abort */
orte_errmgr.proc_aborted(ORTE_NAME_INVALID, ORTE_ERROR_DEFAULT_EXIT_CODE);
} else {
orte_errmgr.proc_aborted(&(jdata->aborted_proc->name), jdata->aborted_proc->exit_code);
}
goto CHECK_ALL_JOBS;
} else if (jdata->num_terminated >= jdata->num_procs) {
/* this job has terminated - now we need to check to see if ALL
@ -945,24 +983,14 @@ CHECK_ALL_JOBS:
/* if the job that is being checked is the HNP, then we are
* trying to terminate the orteds. In that situation, we
* do -not- check all jobs - we simply notify the HNP
* that the orteds are complete
*
* NOTE: remember to protect against jdata being NULL!
*/
if (NULL != jdata &&
jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
/* we have completed the orteds */
int data=1;
write(orteds_exit, &data, sizeof(int));
return;
}
/* now check special case if jdata is NULL - we want
* that the orteds are complete. Also check special case
* if jdata is NULL - we want
* to definitely declare the job done if the orteds
* have completed, no matter what else may be happening.
* This can happen if a ctrl-c hits in the "wrong" place
* while launching
*/
if (jdata == NULL) {
if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
if (jdata->num_terminated >= jdata->num_procs) {
/* orteds are done! */

Просмотреть файл

@ -90,7 +90,7 @@ int orte_plm_base_orted_exit(void)
{
int rc;
opal_buffer_t cmd;
orte_daemon_cmd_flag_t command;
orte_daemon_cmd_flag_t command = ORTE_DAEMON_EXIT_CMD;
orte_job_t *daemons;
orte_proc_t **procs;
@ -98,6 +98,9 @@ int orte_plm_base_orted_exit(void)
"%s plm:base:orted_cmd sending orted_exit commands",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* flag that a shutdown is in progress so all heartbeats stop */
orte_shutdown_in_progress = true;
OBJ_CONSTRUCT(&cmd, opal_buffer_t);
/* since the orteds are being ordered to exit, and we are
@ -117,16 +120,7 @@ int orte_plm_base_orted_exit(void)
procs[0]->state = ORTE_PROC_STATE_TERMINATED;
daemons->num_terminated++;
/* just to be sure - it could be that we are the only daemon in
* the job, and/or that all the other daemons reported termination
* due to some other influence, so check to see if we are all done.
* This will wake us up if everything is done
*/
orte_plm_base_check_job_completed(daemons);
command = ORTE_DAEMON_EXIT_CMD;
/* pack the command */
/* pack the command */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmd, &command, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&cmd);
@ -154,25 +148,26 @@ int orte_plm_base_orted_exit(void)
*/
done_reporting = false;
num_reported = 0;
num_being_sent = 0;
num_being_sent = daemons->num_procs-1;
peer.jobid = ORTE_PROC_MY_NAME->jobid;
for(v=1; v < daemons->num_procs; v++) {
/* if we don't have contact info for this daemon,
* then we know we can't reach it - so don't try
*/
if (NULL == procs[v]->rml_uri) {
--num_being_sent;
continue;
}
peer.vpid = v;
/* check to see if this daemon is known to be "dead" */
if (procs[v]->state > ORTE_PROC_STATE_UNTERMINATED) {
/* don't try to send this */
--num_being_sent;
continue;
}
/* don't worry about errors on the send here - just
* issue it and keep going
*/
++num_being_sent;
ORTE_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:orted_cmd:orted_exit sending cmd to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -187,8 +182,8 @@ int orte_plm_base_orted_exit(void)
* our best attempt
*/
ORTE_DETECT_TIMEOUT(&ev, num_being_sent,
1000*orte_timeout_usec_per_proc,
10*orte_max_timeout, failed_send);
orte_timeout_usec_per_proc,
orte_max_timeout, failed_send);
/* wait for completion or timeout */
ORTE_PROGRESSED_WAIT(done_reporting, num_reported, num_being_sent);
@ -199,8 +194,10 @@ int orte_plm_base_orted_exit(void)
ev = NULL;
}
/* if all the sends didn't go, report that */
if (num_reported < num_being_sent) {
/* if all the sends didn't go, or we couldn't send to
* all daemons, then report that */
if (num_reported < num_being_sent ||
num_being_sent < (daemons->num_procs-1)) {
return ORTE_ERR_SILENT;
}
@ -286,13 +283,14 @@ int orte_plm_base_orted_kill_local_procs(orte_jobid_t job)
*/
done_reporting = false;
num_reported = 0;
num_being_sent = 0;
num_being_sent = daemons->num_procs-1;
peer.jobid = ORTE_PROC_MY_NAME->jobid;
for(v=1; v < daemons->num_procs; v++) {
/* if we don't have contact info for this daemon,
* then we know we can't reach it - so don't try
*/
if (NULL == procs[v]->rml_uri) {
--num_being_sent;
continue;
}
peer.vpid = v;
@ -300,11 +298,11 @@ int orte_plm_base_orted_kill_local_procs(orte_jobid_t job)
if (procs[v]->state > ORTE_PROC_STATE_UNTERMINATED) {
/* don't try to send this */
continue;
--num_being_sent;
}
/* don't worry about errors on the send here - just
* issue it and keep going
*/
++num_being_sent;
ORTE_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:orted_cmd:kill_local_procs sending cmd to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -319,8 +317,8 @@ int orte_plm_base_orted_kill_local_procs(orte_jobid_t job)
* our best attempt
*/
ORTE_DETECT_TIMEOUT(&ev, num_being_sent,
1000*orte_timeout_usec_per_proc,
10*orte_max_timeout, failed_send);
orte_timeout_usec_per_proc,
orte_max_timeout, failed_send);
/* wait for completion or timeout */
ORTE_PROGRESSED_WAIT(done_reporting, num_reported, num_being_sent);
@ -331,8 +329,10 @@ int orte_plm_base_orted_kill_local_procs(orte_jobid_t job)
ev = NULL;
}
/* if all the sends didn't go, report that */
if (num_reported < num_being_sent) {
/* if all the sends didn't go, or we couldn't send to
* all daemons, then report that */
if (num_reported < num_being_sent ||
num_being_sent < (daemons->num_procs-1)) {
return ORTE_ERR_SILENT;
}

Просмотреть файл

@ -27,6 +27,10 @@
#include "orte/constants.h"
#include "orte/types.h"
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#include "opal/class/opal_list.h"
#include "orte/util/output.h"
#include "opal/mca/mca.h"
@ -107,6 +111,7 @@ void orte_plm_base_receive_process_msg(int fd, short event, void *data)
orte_proc_state_t state;
orte_exit_code_t exit_code;
int rc, ret;
struct timeval beat;
count = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, &command, &count, ORTE_PLM_CMD))) {
@ -249,6 +254,21 @@ void orte_plm_base_receive_process_msg(int fd, short event, void *data)
orte_plm_base_check_job_completed(jdata);
break;
case ORTE_PLM_HEARTBEAT_CMD:
ORTE_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:receive got heartbeat from %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&mev->sender)));
/* lookup the daemon object */
if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return;
}
procs = (orte_proc_t**)jdata->procs->addr;
gettimeofday(&beat, NULL);
procs[mev->sender.vpid]->beat = beat.tv_sec;
break;
default:
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
return;

Просмотреть файл

@ -76,6 +76,12 @@ ORTE_DECLSPEC int orte_plm_base_set_hnp_name(void);
ORTE_DECLSPEC int orte_plm_base_create_jobid(orte_jobid_t *jobid);
/**
* Heartbeat support
*/
ORTE_DECLSPEC void orte_plm_base_heartbeat(int fd, short event, void *data);
ORTE_DECLSPEC void orte_plm_base_start_heart(void);
/**
* Utilities for plm components that use proxy daemons
*/
@ -99,7 +105,8 @@ ORTE_DECLSPEC void orte_plm_base_recv(int status, orte_process_name_t* sender,
ORTE_DECLSPEC int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
char *sds,
int *proc_vpid_index,
int *node_name_index);
int *node_name_index,
bool heartbeat);
/*
* Proxy functions for use by daemons and application procs

Просмотреть файл

@ -208,7 +208,7 @@ GETMAP:
/* Add basic orted command line options */
orte_plm_base_orted_append_basic_args(&argc, &argv, "env",
&proc_vpid_index,
&node_name_index);
&node_name_index, false);
if (0 < orte_output_get_verbosity(orte_plm_globals.output)) {
param = opal_argv_join(argv, ' ');

Просмотреть файл

@ -223,7 +223,7 @@ static int plm_lsf_launch_job(orte_job_t *jdata)
orte_plm_base_orted_append_basic_args(&argc, &argv,
"lsf",
&proc_vpid_index,
NULL);
NULL, false);
/* tell the new daemons the base of the name list so they can compute
* their own name on the other end

Просмотреть файл

@ -116,6 +116,8 @@ typedef uint8_t orte_plm_cmd_flag_t;
#define ORTE_PLM_CMD OPAL_UINT8
#define ORTE_PLM_LAUNCH_JOB_CMD 1
#define ORTE_PLM_UPDATE_PROC_STATE 2
#define ORTE_PLM_HEARTBEAT_CMD 3
END_C_DECLS

Просмотреть файл

@ -457,7 +457,8 @@ static int setup_launch(int *argcptr, char ***argvptr,
orte_plm_base_orted_append_basic_args(&argc, &argv,
"env",
proc_vpid_index,
node_name_index2);
node_name_index2,
true);
if (0 < orte_output_get_verbosity(orte_plm_globals.output)) {
param = opal_argv_join(argv, ' ');
@ -1151,6 +1152,14 @@ launch_apps:
orte_plm_base_launch_failed(jdata->jobid, false, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
}
/* setup a "heartbeat" timer to periodically check on
* the state-of-health of the orteds, if requested AND
* we actually launched some daemons!
*/
if (0 < map->num_new_daemons) {
orte_plm_base_start_heart();
}
return rc;
}

Просмотреть файл

@ -280,7 +280,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
orte_plm_base_orted_append_basic_args(&argc, &argv,
"slurm",
&proc_vpid_index,
NULL);
NULL, false);
/* tell the new daemons the base of the name list so they can compute
* their own name on the other end

Просмотреть файл

@ -106,7 +106,7 @@ static int plm_tm_open(void)
mca_plm_tm_component.want_path_check = OPAL_INT_TO_BOOL(tmp);
mca_plm_tm_component.checked_paths = NULL;
return ORTE_SUCCESS;
}

Просмотреть файл

@ -65,6 +65,7 @@
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_wakeup.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/rmaps.h"
@ -86,6 +87,12 @@ static int plm_tm_finalize(void);
static int plm_tm_connect(void);
static int plm_tm_disconnect(void);
static void failed_start(int fd, short event, void *arg);
/*
* Local "global" variables
*/
static opal_event_t *ev=NULL;
/*
* Global variable
@ -139,21 +146,10 @@ static int plm_tm_launch_job(orte_job_t *jdata)
tm_task_id *tm_task_ids = NULL;
int local_err;
tm_event_t event;
struct timeval launchstart, launchstop, completionstart, completionstop;
struct timeval jobstart, jobstop;
int maxtime=0, mintime=99999999, maxiter = 0, miniter = 0, deltat;
float avgtime=0.0;
bool failed_launch = true;
mode_t current_umask;
/* check for timing request - get start time if so */
if (orte_timing) {
if (0 != gettimeofday(&jobstart, NULL)) {
orte_output(0, "plm_tm: could not obtain job start time");
}
}
/* create a jobid for this job */
if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(&jdata->jobid))) {
ORTE_ERROR_LOG(rc);
@ -206,7 +202,8 @@ static int plm_tm_launch_job(orte_job_t *jdata)
/* Add basic orted command line options */
orte_plm_base_orted_append_basic_args(&argc, &argv, "env",
&proc_vpid_index,
&node_name_index);
&node_name_index,
true);
if (0 < orte_output_get_verbosity(orte_plm_globals.output)) {
param = opal_argv_join(argv, ' ');
@ -316,15 +313,6 @@ static int plm_tm_launch_job(orte_job_t *jdata)
if (NULL != param) free(param);
}
/* check for timing request - get start time if so */
if (orte_timing) {
if (0 != gettimeofday(&launchstart, NULL)) {
orte_output(0, "plm_tm: could not obtain start time");
launchstart.tv_sec = 0;
launchstart.tv_usec = 0;
}
}
rc = tm_spawn(argc, argv, env, node->launch_id, tm_task_ids + launched, tm_events + launched);
if (TM_SUCCESS != rc) {
orte_show_help("help-plm-tm.txt", "tm-spawn-failed",
@ -333,25 +321,6 @@ static int plm_tm_launch_job(orte_job_t *jdata)
goto cleanup;
}
/* check for timing request - get stop time and process if so */
if (orte_timing) {
if (0 != gettimeofday(&launchstop, NULL)) {
orte_output(0, "plm_tm: could not obtain stop time");
} else {
deltat = (launchstop.tv_sec - launchstart.tv_sec)*1000000 +
(launchstop.tv_usec - launchstart.tv_usec);
avgtime = avgtime + deltat / map->num_new_daemons;
if (deltat < mintime) {
mintime = deltat;
miniter = launched;
}
if (deltat > maxtime) {
maxtime = deltat;
maxiter = launched;
}
}
}
launched++;
/* Allow some progress to occur */
@ -362,15 +331,6 @@ static int plm_tm_launch_job(orte_job_t *jdata)
"%s plm:tm:launch: finished spawning orteds",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* check for timing request - get start time for launch completion */
if (orte_timing) {
if (0 != gettimeofday(&completionstart, NULL)) {
orte_output(0, "plm_tm: could not obtain completion start time");
completionstart.tv_sec = 0;
completionstart.tv_usec = 0;
}
}
/* TM poll for all the spawns */
for (i = 0; i < launched; ++i) {
rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
@ -381,6 +341,19 @@ static int plm_tm_launch_job(orte_job_t *jdata)
}
}
/* set a timer to tell us if one or more daemon's fails to start - use the
* millisec/daemon timeout provided by the user to compute time
*/
if (0 < orte_startup_timeout) {
ORTE_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm: setting startup timer for %d milliseconds",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_startup_timeout));
ORTE_DETECT_TIMEOUT(&ev, map->num_new_daemons,
orte_startup_timeout*1000,
-1, failed_start);
}
/* wait for daemons to callback */
if (ORTE_SUCCESS != (rc = orte_plm_base_daemon_callback(map->num_new_daemons))) {
ORTE_OUTPUT_VERBOSE((1, orte_plm_globals.output,
@ -390,19 +363,9 @@ static int plm_tm_launch_job(orte_job_t *jdata)
goto cleanup;
}
/* check for timing request - get stop time for launch completion and report */
if (orte_timing) {
if (0 != gettimeofday(&completionstop, NULL)) {
orte_output(0, "plm_tm: could not obtain completion stop time");
} else {
deltat = (completionstop.tv_sec - jobstart.tv_sec)*1000000 +
(completionstop.tv_usec - completionstop.tv_usec);
orte_output(0, "plm_tm: time to launch/wireup all daemons: %d usec", deltat);
}
orte_output(0, "plm_tm: Launch statistics:");
orte_output(0, "plm_tm: Average time to launch an orted: %f usec", avgtime);
orte_output(0, "plm_tm: Max time to launch an orted: %d usec at iter %d", maxtime, maxiter);
orte_output(0, "plm_tm: Min time to launch an orted: %d usec at iter %d", mintime, miniter);
/* if issued, cancel the failed-to-start timer */
if (NULL != ev) {
opal_event_del(ev);
}
launch_apps:
@ -446,17 +409,15 @@ launch_apps:
/* check for failed launch - if so, force terminate */
if (failed_launch) {
orte_plm_base_launch_failed(jdata->jobid, true, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
return rc;
}
/* check for timing request - get stop time and process if so */
if (orte_timing) {
if (0 != gettimeofday(&jobstop, NULL)) {
orte_output(0, "plm_tm: could not obtain stop time");
} else {
deltat = (jobstop.tv_sec - jobstart.tv_sec)*1000000 +
(jobstop.tv_usec - jobstart.tv_usec);
orte_output(0, "plm_tm: launch of entire job required %d usec", deltat);
}
/* setup a "heartbeat" timer to periodically check on
* the state-of-health of the orteds, if requested AND
* we actually launched some daemons!
*/
if (0 < map->num_new_daemons) {
orte_plm_base_start_heart();
}
ORTE_OUTPUT_VERBOSE((1, orte_plm_globals.output,
@ -556,3 +517,24 @@ static int plm_tm_disconnect(void)
return ORTE_SUCCESS;
}
/*
 * Timer callback for the daemon startup timeout: reaching this
 * function means one or more daemons failed to start in time.
 *
 * Unless an abnormal termination has already been ordered, the daemon
 * job is reported to orte_plm_base_launch_failed() with state
 * ORTE_JOB_STATE_FAILED_TO_START. The fd, dummy and arg arguments are
 * not used.
 */
static void failed_start(int fd, short dummy, void *arg)
{
    ORTE_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                         "%s plm:tm:failed_start",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    if (!orte_abnormal_term_ordered) {
        /* normal case: declare the daemon launch failed */
        orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, true, -1,
                                    ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
        return;
    }

    /* an abort is already underway - just note that we were called */
    ORTE_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                         "%s plm:tm:failed_start - abnormal term in progress",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
}

Просмотреть файл

@ -95,6 +95,7 @@ static opal_event_t *orted_exit_event;
static void shutdown_callback(int fd, short flags, void *arg);
static void signal_callback(int fd, short event, void *arg);
static void clean_fail(int fd, short flags, void *arg);
static struct {
bool debug;
@ -108,6 +109,9 @@ static struct {
int uri_pipe;
int singleton_died_pipe;
int fail;
int fail_delay;
bool abort;
int heartbeat;
} orted_globals;
/*
@ -124,12 +128,20 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
"Have the orted spin until we can connect a debugger to it" },
{ NULL, NULL, NULL, '\0', NULL, "debug-failure", 1,
&orted_globals.fail, OPAL_CMD_LINE_TYPE_INT,
&orted_globals.fail, OPAL_CMD_LINE_TYPE_INT,
"Have the specified orted fail after init for debugging purposes" },
{ NULL, NULL, NULL, '\0', NULL, "debug-failure-delay", 1,
&orted_globals.fail_delay, OPAL_CMD_LINE_TYPE_INT,
"Have the orted specified for failure delay for the provided number of seconds before failing" },
{ NULL, NULL, NULL, '\0', NULL, "heartbeat", 1,
&orted_globals.heartbeat, OPAL_CMD_LINE_TYPE_INT,
"Seconds between orted heartbeat messages to be sent to HNP (default: 0 => no heartbeat)" },
{ "orte", "debug", NULL, 'd', NULL, "debug", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Debug the OpenRTE" },
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Debug the OpenRTE" },
{ "orte", "daemonize", NULL, '\0', NULL, "daemonize", 0,
&orted_globals.daemonize, OPAL_CMD_LINE_TYPE_BOOL,
@ -300,14 +312,43 @@ int orte_daemon(int argc, char *argv[])
return ret;
}
if ((int)ORTE_PROC_MY_NAME->vpid == orted_globals.fail) {
/* Finalize and clean up ourselves */
if (ORTE_SUCCESS != (ret = orte_finalize())) {
ORTE_ERROR_LOG(ret);
if ((int)ORTE_VPID_INVALID != orted_globals.fail) {
orted_globals.abort=false;
/* some vpid was ordered to fail. The value can be positive
* or negative, depending upon the desired method for failure,
* so need to check both here
*/
if (0 > orted_globals.fail) {
orted_globals.fail = -1*orted_globals.fail;
orted_globals.abort = true;
}
/* are we the specified vpid? */
if ((int)ORTE_PROC_MY_NAME->vpid == orted_globals.fail) {
/* if the user specified we delay, then setup a timer
* and have it kill us
*/
if (0 < orted_globals.fail_delay) {
ORTE_TIMER_EVENT(orted_globals.fail_delay, clean_fail);
} else {
opal_output(0, "%s is executing clean %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orted_globals.abort ? "abort" : "abnormal termination");
/* do -not- call finalize as this will send a message to the HNP
* indicating clean termination! Instead, just forcibly cleanup
* the local session_dir tree and exit
*/
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
/* if we were ordered to abort, do so */
if (orted_globals.abort) {
abort();
}
/* otherwise, return with non-zero status */
return ORTE_ERROR_DEFAULT_EXIT_CODE;
}
}
/* return with non-zero status */
return -1;
}
/* detach from controlling terminal
@ -569,6 +610,11 @@ int orte_daemon(int argc, char *argv[])
orte_output(0, "%s orted: up and running - waiting for commands!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
}
/* if we were told to do a heartbeat, then setup to do so */
if (0 < orted_globals.heartbeat) {
ORTE_TIMER_EVENT(orted_globals.heartbeat, orte_plm_base_heartbeat);
}
/* wait to hear we are done */
opal_event_dispatch();
@ -578,12 +624,45 @@ int orte_daemon(int argc, char *argv[])
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
/* Finalize and clean up ourselves */
if (ORTE_SUCCESS != (ret = orte_finalize())) {
ORTE_ERROR_LOG(ret);
}
ret = orte_finalize();
return ret;
}
/*
 * Timer callback that deliberately terminates this orted.
 *
 * Announces the failure mode, removes the log file if one is set,
 * kills all local procs, cleans up the session directory tree, and
 * then either abort()s (when orted_globals.abort is set) or exits
 * with ORTE_ERROR_DEFAULT_EXIT_CODE. This function does not return
 * once it gets past the exit lock. The fd, flags and arg arguments
 * are not used.
 */
static void clean_fail(int fd, short flags, void *arg)
{
    const char *how;

    /* guard against multiple calls to exit: a zero result from the
     * trylock is treated here as "someone else already holds it"
     */
    if (!opal_atomic_trylock(&orted_exit_lock)) {
        return;
    }

    how = orted_globals.abort ? "abort" : "abnormal termination";
    opal_output(0, "%s is executing clean %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                how);

    /* remove our log file, if any */
    if (NULL != log_path) {
        unlink(log_path);
    }

    /* take down our local procs, without updating their state on the
     * HNP since that may be redundant
     */
    orte_odls.kill_local_procs(ORTE_JOBID_WILDCARD, false);

    /* finalize is intentionally NOT called: it would send the HNP a
     * message indicating clean termination. Forcibly clean up the
     * local session_dir tree instead.
     */
    orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);

    /* leave with a non-zero status unless an abort was requested */
    if (!orted_globals.abort) {
        exit(ORTE_ERROR_DEFAULT_EXIT_CODE);
    }
    abort();
}
static void shutdown_callback(int fd, short flags, void *arg)
{
int ret;
@ -613,9 +692,7 @@ static void shutdown_callback(int fd, short flags, void *arg)
orte_odls.kill_local_procs(ORTE_JOBID_WILDCARD, false);
/* Finalize and clean up ourselves */
if (ORTE_SUCCESS != (ret = orte_finalize())) {
ORTE_ERROR_LOG(ret);
}
ret = orte_finalize();
exit(ret);
}

Просмотреть файл

@ -56,7 +56,8 @@ bool orte_help_want_aggregate = true;
bool orte_help_show_recursions;
bool orte_params_set = false;
int orte_debug_verbosity;
int orted_debug_failure = ORTE_VPID_INVALID;
int orted_debug_failure;
int orted_debug_failure_delay;
int32_t orte_contiguous_nodes;
int orte_debug_output = -1;
@ -66,6 +67,10 @@ char **orted_cmd_line=NULL;
int orte_exit, orteds_exit;
int orte_exit_status = 0;
bool orte_abnormal_term_ordered = false;
bool orte_shutdown_in_progress = false;
int orte_heartbeat_rate;
int orte_startup_timeout;
int orte_timeout_usec_per_proc;
float orte_max_timeout;

Просмотреть файл

@ -254,6 +254,8 @@ struct orte_proc_t {
char *nodename;
/* RML contact info */
char *rml_uri;
/* seconds when last heartbeat was detected */
int beat;
#if OPAL_ENABLE_FT == 1
/* ckpt state */
size_t ckpt_state;
@ -326,6 +328,7 @@ ORTE_DECLSPEC extern bool orte_help_show_recursions;
ORTE_DECLSPEC extern bool orte_params_set;
ORTE_DECLSPEC extern int orte_debug_verbosity;
/* debugging aid: selects the orted that should deliberately fail after init */
ORTE_DECLSPEC extern int orted_debug_failure;
/* seconds the selected orted waits before failing (0 => no delay) */
ORTE_DECLSPEC extern int orted_debug_failure_delay;
ORTE_DECLSPEC extern char **orte_launch_environ;
ORTE_DECLSPEC extern opal_pointer_array_t orte_daemonmap;
@ -334,6 +337,10 @@ ORTE_DECLSPEC extern char **orted_cmd_line;
ORTE_DECLSPEC extern int orte_exit, orteds_exit;
ORTE_DECLSPEC extern int orte_exit_status;
ORTE_DECLSPEC extern bool orte_abnormal_term_ordered;
ORTE_DECLSPEC extern bool orte_shutdown_in_progress;
/* seconds between daemon state-of-health checks (0 => do not check) */
ORTE_DECLSPEC extern int orte_heartbeat_rate;
/* milliseconds per daemon to wait for startup before declaring
 * failed-to-start (0 => do not check)
 */
ORTE_DECLSPEC extern int orte_startup_timeout;
/* usec per proc to wait before aborting an ORTE operation */
ORTE_DECLSPEC extern int orte_timeout_usec_per_proc;
/* cap on a computed operation timeout, in usec */
ORTE_DECLSPEC extern float orte_max_timeout;

Просмотреть файл

@ -244,6 +244,7 @@ static void orte_proc_construct(orte_proc_t* proc)
proc->node = NULL;
proc->nodename = NULL;
proc->rml_uri = NULL;
proc->beat = 0;
#if OPAL_ENABLE_FT == 1
proc->ckpt_state = 0;
proc->ckpt_snapshot_ref = NULL;

Просмотреть файл

@ -77,8 +77,20 @@ int orte_register_params(void)
mca_base_param_reg_int_name("orte", "daemon_fail",
"Have the specified orted fail after init for debugging purposes",
false, false, (int)false, &orted_debug_failure);
false, false, ORTE_VPID_INVALID, &orted_debug_failure);
mca_base_param_reg_int_name("orte", "daemon_fail_delay",
"Have the specified orted fail after specified number of seconds (default: 0 => no delay)",
false, false, 0, &orted_debug_failure_delay);
mca_base_param_reg_int_name("orte", "heartbeat_rate",
"Seconds between checks for daemon state-of-health (default: 0 => do not check)",
false, false, 0, &orte_heartbeat_rate);
mca_base_param_reg_int_name("orte", "startup_timeout",
"Milliseconds/daemon to wait for startup before declaring failed_to_start (default: 0 => do not check)",
false, false, 0, &orte_startup_timeout);
/* check for timing requests */
mca_base_param_reg_int_name("orte", "timing",
"Request that critical timing loops be measured",
@ -98,8 +110,8 @@ int orte_register_params(void)
orte_max_timeout = 1000000.0 * value; /* convert to usec */
mca_base_param_reg_int_name("orte", "timeout_step",
"Time to wait [in usecs/proc] before aborting an ORTE operation (default: 100 usec/proc)",
false, false, 100, &orte_timeout_usec_per_proc);
"Time to wait [in usecs/proc] before aborting an ORTE operation (default: 1000 usec/proc)",
false, false, 1000, &orte_timeout_usec_per_proc);
/* default hostfile */
mca_base_param_reg_string_name("orte", "default_hostfile",

Просмотреть файл

@ -231,8 +231,10 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_message_event_t);
opal_evtimer_set(tmp, (cbfunc), NULL); \
now.tv_sec = 0; \
now.tv_usec = (float)(deltat) * (float)(n); \
if (now.tv_usec > (maxwait)) { \
now.tv_usec = (maxwait); \
if (maxwait > 0) { \
if (now.tv_usec > (maxwait)) { \
now.tv_usec = (maxwait); \
} \
} \
if (now.tv_usec > 1000000.0) { \
now.tv_sec = (float)((int)(now.tv_usec/1000000.0)); \

Просмотреть файл

@ -112,6 +112,7 @@ static orte_std_cntr_t total_num_apps = 0;
static bool want_prefix_by_default = (bool) ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT;
static opal_event_t *orterun_event, *orteds_exit_event;
static char *ompi_server=NULL;
static opal_event_t *abort_exit_event=NULL;
/*
* Globals
@ -568,6 +569,10 @@ static void job_completed(int trigpipe, short event, void *arg)
return;
}
/* if the abort exit event is set, delete it */
if (NULL != abort_exit_event) {
opal_event_del(abort_exit_event);
}
/* close the trigger pipe */
if (0 <= trigpipe) {
close(trigpipe);
@ -694,7 +699,11 @@ static void terminated(int trigpipe, short event, void *arg)
/* print out node name */
orte_node_t *node = procs[i]->node;
if (NULL != node && NULL != node->name) {
fprintf(stderr, "\t%s\n", node->name);
if (NULL != procs[i]->rml_uri) {
fprintf(stderr, "\t%s\n", node->name);
} else {
fprintf(stderr, "\t%s - daemon did not report back when launched\n", node->name);
}
}
}
}
@ -873,7 +882,6 @@ static void timeout_callback(int fd, short ign, void *arg)
static void abort_exit_callback(int fd, short ign, void *arg)
{
int ret;
opal_event_t *event;
if (!orterun_globals.quiet){
fprintf(stderr, "%s: killing job...\n\n", orterun_basename);
@ -891,20 +899,6 @@ static void abort_exit_callback(int fd, short ign, void *arg)
* after jdata has been OBJ_NEW'd
*/
if (jdata->jobid != ORTE_JOBID_INVALID) {
/* give ourselves a time limit on how long to wait
* for the job to die, just in case we can't make it go
* away for some reason. Don't send us directly back
* to job_completed, though, as that function expects
* to be triggered via orte_wakeup - we could get into
* race conditions, and the timeout won't provide
* that function with the orte_exit pipe fd so it can
* be closed
*/
ORTE_DETECT_TIMEOUT(&event, jdata->num_procs,
orte_timeout_usec_per_proc,
orte_max_timeout,
timeout_callback);
/* terminate the job - this will wake us up and
* call the "terminated" function so we clean up
* and exit
@ -917,6 +911,20 @@ static void abort_exit_callback(int fd, short ign, void *arg)
ORTE_UPDATE_EXIT_STATUS(ret);
orte_wakeup();
}
/* give ourselves a time limit on how long to wait
* for the job to die, just in case we can't make it go
* away for some reason. Don't send us directly back
* to job_completed, though, as that function expects
* to be triggered via orte_wakeup - we could get into
* race conditions, and the timeout won't provide
* that function with the orte_exit pipe fd so it can
* be closed
*/
ORTE_DETECT_TIMEOUT(&abort_exit_event, jdata->num_procs,
orte_timeout_usec_per_proc,
orte_max_timeout,
timeout_callback);
} else {
/* if the jobid is invalid, then we didn't get to
* the point of setting the job up, so there is nothing