Remove the distinction between local and global restarts - leave it up to the error strategy to decide which to do.
Clean up the heartbeat handling so it is associated with the proc, not a node. Clean up the handling of recovery options so that defaults do not override user-provided values. This commit was SVN r24382.
Этот коммит содержится в:
родитель
172ad649e1
Коммит
a9dca25ca5
@ -583,9 +583,9 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
|
||||
if( jdata->enable_recovery ) {
|
||||
/* is this a local proc */
|
||||
if (NULL != (child = proc_is_local(proc))) {
|
||||
/* local proc - see if it has reached its local restart limit */
|
||||
/* local proc - see if it has reached its restart limit */
|
||||
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, child->app_idx);
|
||||
if (child->restarts < app->max_local_restarts) {
|
||||
if (child->restarts < app->max_restarts) {
|
||||
child->restarts++;
|
||||
if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) {
|
||||
return ORTE_SUCCESS;
|
||||
@ -594,9 +594,6 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
|
||||
* have cleared it
|
||||
*/
|
||||
child->state = state;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
/* let it fall thru to abort */
|
||||
} else {
|
||||
/* see if we can relocate it somewhere else */
|
||||
if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) {
|
||||
return ORTE_SUCCESS;
|
||||
@ -1580,10 +1577,10 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc,
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
app_name = app->app;
|
||||
/* track that we are attempting to relocate */
|
||||
pdata->relocates++;
|
||||
/* have we exceeded the number of relocates for this proc? */
|
||||
if (app->max_global_restarts < pdata->relocates) {
|
||||
/* track that we are attempting to restart */
|
||||
pdata->restarts++;
|
||||
/* have we exceeded the number of restarts for this proc? */
|
||||
if (app->max_restarts < pdata->restarts) {
|
||||
return ORTE_ERR_RESTART_LIMIT_EXCEEDED;
|
||||
}
|
||||
|
||||
|
@ -310,7 +310,7 @@ static int update_state(orte_jobid_t job,
|
||||
killprocs(proc->jobid, proc->vpid);
|
||||
}
|
||||
app = jobdat->apps[child->app_idx];
|
||||
if( jobdat->enable_recovery && child->restarts < app->max_local_restarts ) {
|
||||
if( jobdat->enable_recovery && child->restarts < app->max_restarts ) {
|
||||
child->restarts++;
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr:orted restarting proc %s for the %d time",
|
||||
@ -340,8 +340,8 @@ static int update_state(orte_jobid_t job,
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s CHECKING RESTARTS %d VS MAX %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
child->restarts, app->max_local_restarts));
|
||||
if (child->restarts < app->max_local_restarts ) {
|
||||
child->restarts, app->max_restarts));
|
||||
if (child->restarts < app->max_restarts ) {
|
||||
/* attempt to restart it locally */
|
||||
child->restarts++;
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
|
@ -72,9 +72,9 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
|
||||
{
|
||||
orte_job_t *jdatorted;
|
||||
orte_app_context_t *app;
|
||||
int rc, tmp;
|
||||
int rc;
|
||||
int32_t ljob;
|
||||
orte_app_idx_t i;
|
||||
int i;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:setup_job for job %s",
|
||||
@ -93,34 +93,18 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
|
||||
ljob = ORTE_LOCAL_JOBID(jdata->jobid);
|
||||
opal_pointer_array_set_item(orte_job_data, ljob, jdata);
|
||||
|
||||
/* see if recovery was set in the app */
|
||||
for (i=0; i < jdata->num_apps; i++) {
|
||||
/* if job recovery is not defined, set it to default */
|
||||
if (!jdata->recovery_defined) {
|
||||
/* set to system default */
|
||||
jdata->enable_recovery = orte_enable_recovery;
|
||||
}
|
||||
/* if app recovery is not defined, set apps to defaults */
|
||||
for (i=0; i < jdata->apps->size; i++) {
|
||||
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
|
||||
/* big problem! */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
continue;
|
||||
}
|
||||
if (ORTE_SUCCESS == mca_base_param_find_int_name("orte", "enable_recovery", app->env, &tmp)) {
|
||||
jdata->enable_recovery = OPAL_INT_TO_BOOL(tmp);
|
||||
} else {
|
||||
jdata->enable_recovery = orte_enable_recovery;
|
||||
}
|
||||
if (ORTE_SUCCESS == mca_base_param_find_int_name("orte", "max_global_restarts", app->env, &tmp)) {
|
||||
app->max_global_restarts = tmp;
|
||||
} else {
|
||||
app->max_global_restarts = orte_max_global_restarts;
|
||||
}
|
||||
if (ORTE_SUCCESS == mca_base_param_find_int_name("orte", "max_local_restarts", app->env, &tmp)) {
|
||||
app->max_local_restarts = tmp;
|
||||
} else {
|
||||
app->max_local_restarts = orte_max_local_restarts;
|
||||
|
||||
}
|
||||
/* consistency check */
|
||||
if (app->max_global_restarts > 0 ||
|
||||
app->max_local_restarts > 0) {
|
||||
jdata->enable_recovery = true;
|
||||
|
||||
if (!app->recovery_defined) {
|
||||
app->max_restarts = orte_max_restarts;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -86,7 +86,7 @@ static void rml_callback_fn(int status,
|
||||
/* local globals */
|
||||
static opal_event_t *send_ev = NULL, *check_ev = NULL;
|
||||
static struct timeval send_time, check_time;
|
||||
static double timeout;
|
||||
static orte_job_t *daemons;
|
||||
|
||||
#include MCA_timer_IMPLEMENTATION_HEADER
|
||||
static inline double gettime(void) __opal_attribute_always_inline__;
|
||||
@ -106,12 +106,18 @@ static inline double gettime(void)
|
||||
|
||||
static int init(void)
|
||||
{
|
||||
int rc;
|
||||
int rc=ORTE_SUCCESS;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"%s initializing heartbeat recvs",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* get the daemon job object */
|
||||
if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
|
||||
/* can't run */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
#if ORTE_ENABLE_MULTICAST
|
||||
/* setup multicast recv for heartbeats */
|
||||
@ -133,7 +139,7 @@ static int init(void)
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
@ -142,10 +148,12 @@ static void finalize(void)
|
||||
if (NULL != send_ev) {
|
||||
opal_event_del(send_ev);
|
||||
free(send_ev);
|
||||
send_ev = NULL;
|
||||
}
|
||||
if (NULL != check_ev) {
|
||||
opal_event_del(check_ev);
|
||||
free(check_ev);
|
||||
check_ev = NULL;
|
||||
}
|
||||
|
||||
#if ORTE_ENABLE_MULTICAST
|
||||
@ -156,46 +164,67 @@ static void finalize(void)
|
||||
return;
|
||||
}
|
||||
|
||||
/*
 * Convert a colon-separated "sec:usec" string into a timeval.
 *
 * input - text that is split on ':'; the first field is stored as
 *         seconds and the (optional) second field as microseconds
 * time  - output; zeroed up front, so any field that is missing from
 *         input is left at 0
 *
 * Fields are converted with strtol() in base 10; no range or syntax
 * validation is performed here.
 */
static void setup_time(char *input, struct timeval *time)
{
    char **val;

    /* set default */
    time->tv_sec = 0;
    time->tv_usec = 0;

    /* convert the rate to time */
    val = opal_argv_split(input, ':');
    if (NULL == val) {
        /* nothing to do */
        return;
    }

    if (NULL != val[0]) {
        time->tv_sec = strtol(val[0], NULL, 10);
        /* only look at the second field when the first one exists, so
         * we never index beyond a NULL entry (presumably the array
         * terminator)
         */
        if (NULL != val[1]) {
            time->tv_usec = strtol(val[1], NULL, 10);
        }
    }

    /* the split array was allocated for us - release it so that each
     * call does not leak it
     */
    opal_argv_free(val);
}
|
||||
|
||||
|
||||
/*
|
||||
* Start sending and checking heartbeats
|
||||
*/
|
||||
static void start(orte_jobid_t jobid)
|
||||
{
|
||||
uint64_t time;
|
||||
|
||||
if (jobid != ORTE_PROC_MY_NAME->jobid && ORTE_JOBID_WILDCARD != jobid) {
|
||||
/* heartbeats are only for daemons and HNPs */
|
||||
/* convert the send rate */
|
||||
setup_time(mca_sensor_heartbeat_component.rate, &send_time);
|
||||
if (0 == send_time.tv_sec &&
|
||||
0 == send_time.tv_usec) {
|
||||
/* nothing to do */
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
if (!ORTE_PROC_IS_DAEMON) {
|
||||
/* convert the check rate */
|
||||
setup_time(mca_sensor_heartbeat_component.check, &check_time);
|
||||
if (0 == check_time.tv_sec &&
|
||||
0 == check_time.tv_usec) {
|
||||
/* no sense in running if we won't check */
|
||||
return;
|
||||
}
|
||||
|
||||
/* setup the check */
|
||||
check_ev = (opal_event_t*)malloc(sizeof(opal_event_t));
|
||||
opal_event_evtimer_set(opal_event_base, check_ev, check_heartbeat, check_ev);
|
||||
opal_event_evtimer_add(check_ev, &check_time);
|
||||
}
|
||||
|
||||
/* setup the send */
|
||||
time = mca_sensor_heartbeat_component.rate * 1000; /* convert to microsecs */
|
||||
send_ev = (opal_event_t *) malloc(sizeof(opal_event_t));
|
||||
send_ev = (opal_event_t*)malloc(sizeof(opal_event_t));
|
||||
opal_event_evtimer_set(opal_event_base, send_ev, send_heartbeat, send_ev);
|
||||
send_time.tv_sec = time / 1000000;
|
||||
send_time.tv_usec = time % 1000000;
|
||||
opal_event_evtimer_add(send_ev, &send_time);
|
||||
|
||||
/* define the timeout */
|
||||
timeout = 2.0 * (double)time;
|
||||
|
||||
/* setup the check */
|
||||
time = mca_sensor_heartbeat_component.check * 1000; /* convert to microsecs */
|
||||
check_ev = (opal_event_t *) malloc(sizeof(opal_event_t));
|
||||
opal_event_evtimer_set(opal_event_base, check_ev, check_heartbeat, check_ev);
|
||||
check_time.tv_sec = time / 1000000;
|
||||
check_time.tv_usec = time % 1000000;
|
||||
opal_event_evtimer_add(check_ev, &check_time);
|
||||
}
|
||||
|
||||
|
||||
static void stop(orte_jobid_t jobid)
|
||||
{
|
||||
if (jobid != ORTE_PROC_MY_NAME->jobid && ORTE_JOBID_WILDCARD != jobid) {
|
||||
/* heartbeats are only for daemons and HNPs */
|
||||
return;
|
||||
}
|
||||
|
||||
if (NULL != send_ev) {
|
||||
opal_event_del(send_ev);
|
||||
free(send_ev);
|
||||
@ -217,7 +246,7 @@ static void send_heartbeat(int fd, short event, void *arg)
|
||||
|
||||
/* if we are aborting or shutting down, ignore this */
|
||||
if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) {
|
||||
return;
|
||||
goto reset;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
@ -233,7 +262,7 @@ static void send_heartbeat(int fd, short event, void *arg)
|
||||
rmcast_callback_fn, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(buf);
|
||||
return;
|
||||
goto reset;
|
||||
}
|
||||
#else
|
||||
/* send heartbeat to HNP */
|
||||
@ -242,10 +271,11 @@ static void send_heartbeat(int fd, short event, void *arg)
|
||||
rml_callback_fn, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(buf);
|
||||
return;
|
||||
goto reset;
|
||||
}
|
||||
#endif
|
||||
|
||||
reset:
|
||||
/* reset the timer */
|
||||
opal_event_evtimer_add(tmp, &send_time);
|
||||
}
|
||||
@ -257,22 +287,26 @@ static void send_heartbeat(int fd, short event, void *arg)
|
||||
static void check_heartbeat(int fd, short dummy, void *arg)
|
||||
{
|
||||
int v;
|
||||
orte_nid_t *nid;
|
||||
double now;
|
||||
orte_proc_t *proc;
|
||||
time_t now;
|
||||
opal_event_t *tmp = (opal_event_t*)arg;
|
||||
orte_process_name_t name;
|
||||
|
||||
double delta;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"%s sensor:check_heartbeat",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* if we are aborting or shutting down, ignore this */
|
||||
if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) {
|
||||
return;
|
||||
goto reset;
|
||||
}
|
||||
|
||||
name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
|
||||
/* compute a send time interval */
|
||||
delta = send_time.tv_sec + (double)send_time.tv_usec/1000000.0;
|
||||
|
||||
/* get current time */
|
||||
now = gettime();
|
||||
|
||||
@ -280,26 +314,50 @@ static void check_heartbeat(int fd, short dummy, void *arg)
|
||||
* in case multiple daemons are late so all of those that did
|
||||
* can be appropriately flagged
|
||||
*/
|
||||
for (v=0; v < orte_nidmap.size; v++) {
|
||||
if (NULL == (nid = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, v))) {
|
||||
for (v=0; v < daemons->procs->size; v++) {
|
||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, v))) {
|
||||
continue;
|
||||
}
|
||||
if (0 == nid->beat) {
|
||||
/* ignore myself */
|
||||
if ((int)ORTE_PROC_MY_NAME->vpid == v) {
|
||||
continue;
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"%s CHECKING HEARTBEAT FOR %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name)));
|
||||
|
||||
if (0 == proc->beat) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"%s NO BEAT YET",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
/* haven't recvd a beat yet */
|
||||
continue;
|
||||
}
|
||||
if ((now - nid->beat) > timeout) {
|
||||
nid->missed++;
|
||||
if (mca_sensor_heartbeat_component.missed < nid->missed) {
|
||||
/* heartbeat failed */
|
||||
name.vpid = v;
|
||||
orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_HEARTBEAT_FAILED,
|
||||
&name, ORTE_PROC_STATE_HEARTBEAT_FAILED,
|
||||
0, ORTE_ERR_HEARTBEAT_LOST);
|
||||
}
|
||||
|
||||
/* compute number of heartbeats missed */
|
||||
proc->missed = (int)((double)(now - proc->beat) / delta);
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"%s MISSING %d BEATS",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), proc->missed));
|
||||
if (mca_sensor_heartbeat_component.missed < proc->missed) {
|
||||
/* heartbeat failed */
|
||||
name.vpid = v;
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"%s sensor:check_heartbeat FAILED for daemon %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&name)));
|
||||
orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_HEARTBEAT_FAILED,
|
||||
&name, ORTE_PROC_STATE_HEARTBEAT_FAILED,
|
||||
0, ORTE_ERR_HEARTBEAT_LOST);
|
||||
/* zero the last beat to indicate we are waiting to recv
|
||||
* the first beat from the restarted daemon
|
||||
*/
|
||||
proc->beat = 0;
|
||||
}
|
||||
}
|
||||
|
||||
reset:
|
||||
/* reset the timer */
|
||||
opal_event_evtimer_add(tmp, &check_time);
|
||||
}
|
||||
@ -312,7 +370,7 @@ static void recv_rmcast_beats(int status,
|
||||
orte_process_name_t *sender,
|
||||
opal_buffer_t *buf, void* cbdata)
|
||||
{
|
||||
orte_nid_t *nid;
|
||||
orte_proc_t *proc;
|
||||
|
||||
/* if we are aborting or shutting down, ignore this */
|
||||
if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) {
|
||||
@ -324,15 +382,13 @@ static void recv_rmcast_beats(int status,
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(sender)));
|
||||
|
||||
/* get this daemon's nid - if it isn't here, just ignore
|
||||
* as this is caused by a race condition at startup
|
||||
*/
|
||||
if (NULL != (nid = orte_util_lookup_nid(sender))) {
|
||||
/* get this daemon's object */
|
||||
if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, sender->vpid))) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"%s updating beat time for %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(sender)));
|
||||
nid->beat = gettime();
|
||||
proc->beat = gettime();
|
||||
}
|
||||
}
|
||||
|
||||
@ -351,11 +407,11 @@ static void recv_rml_beats(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
||||
void* cbdata)
|
||||
{
|
||||
orte_nid_t *nid;
|
||||
orte_proc_t *proc;
|
||||
|
||||
/* if we are aborting or shutting down, ignore this */
|
||||
if (orte_abnormal_term_ordered || orte_finalizing || !orte_intialized) {
|
||||
return;
|
||||
goto reset;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
@ -363,22 +419,16 @@ static void recv_rml_beats(int status, orte_process_name_t* sender,
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(sender)));
|
||||
|
||||
/* get this daemon's nid - if it isn't here, just ignore
|
||||
* as this is caused by a race condition at startup
|
||||
*/
|
||||
if (NULL != (nid = orte_util_lookup_nid(sender))) {
|
||||
/* get this daemon's object */
|
||||
if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, sender->vpid))) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"%s updating beat time for %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(sender)));
|
||||
nid->beat = gettime();
|
||||
} else {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
|
||||
"%s no nidmap entry for %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(sender)));
|
||||
proc->beat = gettime();
|
||||
}
|
||||
|
||||
reset:
|
||||
/* reissue the recv */
|
||||
if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
|
||||
ORTE_RML_TAG_HEARTBEAT,
|
||||
|
@ -23,8 +23,8 @@ BEGIN_C_DECLS
|
||||
|
||||
struct orte_sensor_heartbeat_component_t {
|
||||
orte_sensor_base_component_t super;
|
||||
int rate;
|
||||
int check;
|
||||
char *rate;
|
||||
char *check;
|
||||
int missed;
|
||||
};
|
||||
typedef struct orte_sensor_heartbeat_component_t orte_sensor_heartbeat_component_t;
|
||||
|
@ -58,19 +58,17 @@ static int orte_sensor_heartbeat_open(void)
|
||||
int tmp;
|
||||
|
||||
/* lookup parameters */
|
||||
mca_base_param_reg_int(c, "rate",
|
||||
"Heartbeat rate in milliseconds (default=0)",
|
||||
false, false, 0, &tmp);
|
||||
mca_sensor_heartbeat_component.rate = tmp;
|
||||
mca_base_param_reg_string(c, "rate",
|
||||
"Heartbeat rate in sec (default=0:0)",
|
||||
false, false, "0:0", &mca_sensor_heartbeat_component.rate);
|
||||
|
||||
mca_base_param_reg_int(c, "check",
|
||||
"Check for failure rate in milliseconds (default=500)",
|
||||
false, false, 500, &tmp);
|
||||
mca_sensor_heartbeat_component.check = tmp;
|
||||
mca_base_param_reg_string(c, "check",
|
||||
"Check for failure rate in sec:usec (default=1:0)",
|
||||
false, false, "1:0", &mca_sensor_heartbeat_component.check);
|
||||
|
||||
mca_base_param_reg_int(c, "missed",
|
||||
"Number of missed heartbeats before failure is declared (default=5)",
|
||||
false, false, 5, &tmp);
|
||||
"Number of missed heartbeat checks before failure is declared (default=2)",
|
||||
false, false, 2, &tmp);
|
||||
mca_sensor_heartbeat_component.missed = tmp;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
@ -79,18 +77,9 @@ static int orte_sensor_heartbeat_open(void)
|
||||
|
||||
static int orte_sensor_heartbeat_query(mca_base_module_t **module, int *priority)
|
||||
{
|
||||
/* only usable by daemons and HNPs */
|
||||
if (0 < mca_sensor_heartbeat_component.rate &&
|
||||
(ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP)) {
|
||||
*priority = 10; /* use if we were built */
|
||||
*module = (mca_base_module_t *)&orte_sensor_heartbeat_module;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* otherwise, we are not available */
|
||||
*priority = 0;
|
||||
*module = NULL;
|
||||
return ORTE_ERROR;
|
||||
*priority = 10; /* use if we were built */
|
||||
*module = (mca_base_module_t *)&orte_sensor_heartbeat_module;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -284,6 +284,13 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src,
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the recovery policy defined flag */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
|
||||
(void*)(&(jobs[i]->recovery_defined)), 1, OPAL_BOOL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the recovery flag */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
|
||||
(void*)(&(jobs[i]->enable_recovery)), 1, OPAL_BOOL))) {
|
||||
@ -477,13 +484,6 @@ int orte_dt_pack_proc(opal_buffer_t *buffer, const void *src,
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the number of relocates */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
|
||||
(void*)&(procs[i]->relocates), 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
/* pack the ckpt state */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
|
||||
@ -749,21 +749,9 @@ int orte_dt_pack_app_context(opal_buffer_t *buffer, const void *src,
|
||||
}
|
||||
}
|
||||
|
||||
/* pack the restart limits */
|
||||
/* pack the restart limit */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
|
||||
(void*)(&(app_context[i]->max_local_restarts)), 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
|
||||
(void*)(&(app_context[i]->max_global_restarts)), 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the constrain flag */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
|
||||
(void*)(&(app_context[i]->constrain)), 1, OPAL_BOOL))) {
|
||||
(void*)(&(app_context[i]->max_restarts)), 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
@ -498,8 +498,8 @@ int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_
|
||||
free(tmp);
|
||||
tmp = tmp2;
|
||||
|
||||
asprintf(&tmp2, "%s\n%s\tState: %s\tRestarts: %d\tRelocates: %d\tApp_context: %ld\tSlot list: %s", tmp, pfx2,
|
||||
orte_proc_state_to_str(src->state), src->restarts, src->relocates, (long)src->app_idx,
|
||||
asprintf(&tmp2, "%s\n%s\tState: %s\tRestarts: %d\tApp_context: %ld\tSlot list: %s", tmp, pfx2,
|
||||
orte_proc_state_to_str(src->state), src->restarts, (long)src->app_idx,
|
||||
(NULL == src->slot_list) ? "NULL" : src->slot_list);
|
||||
free(tmp);
|
||||
|
||||
@ -528,10 +528,9 @@ int orte_dt_print_app_context(char **output, char *prefix, orte_app_context_t *s
|
||||
asprintf(&pfx2, "%s", prefix);
|
||||
}
|
||||
|
||||
asprintf(&tmp, "\n%sData for app_context: name: %s\t index %lu\tapp: %s\n%s\tNum procs: %lu\tMax Local Restarts: %d\tMax Global Restarts %d\tConstrain: %s",
|
||||
asprintf(&tmp, "\n%sData for app_context: name: %s\t index %lu\tapp: %s\n%s\tNum procs: %lu\tMax Restarts: %d",
|
||||
pfx2, (NULL == src->name) ? "NULL" : src->name, (unsigned long)src->idx, (NULL == src->app) ? "NULL" : src->app,
|
||||
pfx2, (unsigned long)src->num_procs, src->max_local_restarts, src->max_global_restarts,
|
||||
src->constrain ? "TRUE" : "FALSE");
|
||||
pfx2, (unsigned long)src->num_procs, src->max_restarts);
|
||||
|
||||
count = opal_argv_count(src->argv);
|
||||
for (i=0; i < count; i++) {
|
||||
|
@ -297,6 +297,14 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest,
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* unpack the recovery policy defined flag */
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
|
||||
(&(jobs[i]->recovery_defined)), &n, OPAL_BOOL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* unpack the recovery flag */
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
|
||||
@ -520,14 +528,6 @@ int orte_dt_unpack_proc(opal_buffer_t *buffer, void *dest,
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* unpack the number of relocates */
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
|
||||
(&(procs[i]->relocates)), &n, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
/* unpack the ckpt state */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
|
||||
@ -825,27 +825,13 @@ int orte_dt_unpack_app_context(opal_buffer_t *buffer, void *dest,
|
||||
app_context[i]->preload_files_src_dir = NULL;
|
||||
}
|
||||
|
||||
/* unpack the restart limits */
|
||||
/* unpack the restart limit */
|
||||
max_n=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &app_context[i]->max_local_restarts,
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &app_context[i]->max_restarts,
|
||||
&max_n, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
max_n=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &app_context[i]->max_global_restarts,
|
||||
&max_n, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* unpack the constrain flag */
|
||||
max_n=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &app_context[i]->constrain,
|
||||
&max_n, OPAL_BOOL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
/* Unpack the sstore_load */
|
||||
|
@ -173,8 +173,7 @@ bool orte_do_not_barrier = false;
|
||||
|
||||
/* process recovery */
|
||||
bool orte_enable_recovery;
|
||||
int32_t orte_max_global_restarts;
|
||||
int32_t orte_max_local_restarts;
|
||||
int32_t orte_max_restarts;
|
||||
|
||||
/* comm fn for updating state */
|
||||
orte_default_comm_fn_t orte_comm;
|
||||
@ -183,9 +182,6 @@ orte_default_comm_fn_t orte_comm;
|
||||
bool orte_report_child_jobs_separately;
|
||||
struct timeval orte_child_time_to_exit;
|
||||
|
||||
/* orte progress threads */
|
||||
bool orte_progress_threads_enabled;
|
||||
|
||||
#endif /* !ORTE_DISABLE_FULL_RTE */
|
||||
|
||||
int orte_debug_output = -1;
|
||||
@ -567,9 +563,8 @@ static void orte_app_context_construct(orte_app_context_t* app_context)
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
app_context->sstore_load = NULL;
|
||||
#endif
|
||||
app_context->max_local_restarts = -1;
|
||||
app_context->max_global_restarts = -1;
|
||||
app_context->constrain = true;
|
||||
app_context->recovery_defined = false;
|
||||
app_context->max_restarts = -1000;
|
||||
}
|
||||
|
||||
static void orte_app_context_destructor(orte_app_context_t* app_context)
|
||||
@ -692,6 +687,7 @@ static void orte_job_construct(orte_job_t* job)
|
||||
OBJ_CONSTRUCT(&job->dyn_spawn_cond, opal_condition_t);
|
||||
job->dyn_spawn_active = false;
|
||||
|
||||
job->recovery_defined = false;
|
||||
job->enable_recovery = false;
|
||||
|
||||
job->launch_msg_sent.tv_sec = 0;
|
||||
@ -892,7 +888,10 @@ static void orte_proc_construct(orte_proc_t* proc)
|
||||
proc->nodename = NULL;
|
||||
proc->rml_uri = NULL;
|
||||
proc->restarts = 0;
|
||||
proc->relocates = 0;
|
||||
#if ORTE_ENABLE_HEARTBEAT
|
||||
proc->beat = 0;
|
||||
proc->missed = 0;
|
||||
#endif
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
proc->ckpt_state = 0;
|
||||
proc->ckpt_snapshot_ref = NULL;
|
||||
@ -971,10 +970,6 @@ static void orte_nid_construct(orte_nid_t *ptr)
|
||||
ptr->oversubscribed = false;
|
||||
OBJ_CONSTRUCT(&ptr->attrs, opal_list_t);
|
||||
OBJ_CONSTRUCT(&ptr->sysinfo, opal_list_t);
|
||||
#if ORTE_ENABLE_HEARTBEAT
|
||||
ptr->beat = 0;
|
||||
ptr->missed = 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
static void orte_nid_destruct(orte_nid_t *ptr)
|
||||
|
@ -227,15 +227,10 @@ typedef struct {
|
||||
/** What files SStore should load before local launch, if any */
|
||||
char *sstore_load;
|
||||
#endif
|
||||
/* max number of times a process can be restarted locally */
|
||||
int32_t max_local_restarts;
|
||||
/* max number of times a process can be relocated to another node */
|
||||
int32_t max_global_restarts;
|
||||
/* whether or not the procs in this app are constrained to stay
|
||||
* on the specified nodes when restarted, or can move to any
|
||||
* known node
|
||||
*/
|
||||
bool constrain;
|
||||
/* recovery policy has been defined */
|
||||
bool recovery_defined;
|
||||
/* max number of times a process can be restarted */
|
||||
int32_t max_restarts;
|
||||
} orte_app_context_t;
|
||||
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_app_context_t);
|
||||
@ -423,6 +418,8 @@ typedef struct {
|
||||
bool abort;
|
||||
/* proc that caused that to happen */
|
||||
struct orte_proc_t *aborted_proc;
|
||||
/* recovery policy has been defined */
|
||||
bool recovery_defined;
|
||||
/* enable recovery of these processes */
|
||||
bool enable_recovery;
|
||||
/* time launch message was sent */
|
||||
@ -485,8 +482,12 @@ struct orte_proc_t {
|
||||
char *rml_uri;
|
||||
/* number of times this process has been restarted */
|
||||
int32_t restarts;
|
||||
/* number of times this process has been relocated */
|
||||
int32_t relocates;
|
||||
#if ORTE_ENABLE_HEARTBEAT
|
||||
/* time when last heartbeat was detected */
|
||||
double beat;
|
||||
/* number of missed heartbeats */
|
||||
int missed;
|
||||
#endif
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
/* ckpt state */
|
||||
size_t ckpt_state;
|
||||
@ -522,12 +523,6 @@ typedef struct {
|
||||
opal_list_t attrs;
|
||||
/* list of system info */
|
||||
opal_list_t sysinfo;
|
||||
#if ORTE_ENABLE_HEARTBEAT
|
||||
/* seconds when last heartbeat was detected */
|
||||
double beat;
|
||||
/* number of missed heartbeats */
|
||||
int missed;
|
||||
#endif
|
||||
} orte_nid_t;
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_nid_t);
|
||||
|
||||
@ -704,8 +699,7 @@ ORTE_DECLSPEC extern bool orte_do_not_barrier;
|
||||
|
||||
/* process recovery */
|
||||
ORTE_DECLSPEC extern bool orte_enable_recovery;
|
||||
ORTE_DECLSPEC extern int32_t orte_max_global_restarts;
|
||||
ORTE_DECLSPEC extern int32_t orte_max_local_restarts;
|
||||
ORTE_DECLSPEC extern int32_t orte_max_restarts;
|
||||
|
||||
/* comm interface */
|
||||
typedef void (*orte_default_cbfunc_t)(int fd, short event, void *data);
|
||||
@ -724,9 +718,6 @@ ORTE_DECLSPEC int orte_global_comm(orte_process_name_t *recipient,
|
||||
ORTE_DECLSPEC extern bool orte_report_child_jobs_separately;
|
||||
ORTE_DECLSPEC extern struct timeval orte_child_time_to_exit;
|
||||
|
||||
/* orte progress threads */
|
||||
ORTE_DECLSPEC extern bool orte_progress_threads_enabled;
|
||||
|
||||
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
||||
|
||||
END_C_DECLS
|
||||
|
@ -458,46 +458,38 @@ int orte_register_params(void)
|
||||
(int)false, &value);
|
||||
orte_enable_recovery = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
mca_base_param_reg_int_name("orte", "max_global_restarts",
|
||||
"Max number of times to relocate a failed process to a new node",
|
||||
mca_base_param_reg_int_name("orte", "max_restarts",
|
||||
"Max number of times to restart a failed process",
|
||||
false, false,
|
||||
-1, &orte_max_global_restarts);
|
||||
-1, &orte_max_restarts);
|
||||
|
||||
mca_base_param_reg_int_name("orte", "max_local_restarts",
|
||||
"Max number of times to locally restart a failed process before relocating it to a new node",
|
||||
false, false,
|
||||
-1, &orte_max_local_restarts);
|
||||
if (orte_enable_recovery) {
|
||||
if (orte_max_global_restarts <= 0 &&
|
||||
orte_max_local_restarts <= 0) {
|
||||
if (orte_max_restarts <= 0) {
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
opal_output(orte_clean_output,
|
||||
"------------------------------------------------------------\n"
|
||||
"Although the MCA param orte_enable_recovery was set to true,\n"
|
||||
"values for the max number of restarts was not provided:\n\n"
|
||||
"Max global restarts: %d\n"
|
||||
"Max local restarts: %d\n\n"
|
||||
"At least one of these must be a positive value. We are disabling\n"
|
||||
"a value for the max number of restarts was not provided:\n\n"
|
||||
"Max restarts: %d\n"
|
||||
"This must be a positive value. We are disabling\n"
|
||||
"process recovery, but continuing execution.\n"
|
||||
"------------------------------------------------------------",
|
||||
orte_max_global_restarts, orte_max_local_restarts);
|
||||
orte_max_restarts);
|
||||
}
|
||||
orte_enable_recovery = false;
|
||||
}
|
||||
} else if (orte_max_global_restarts > 0 ||
|
||||
orte_max_local_restarts > 0) {
|
||||
} else if (orte_max_restarts > 0) {
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
opal_output(orte_clean_output,
|
||||
"------------------------------------------------------------------\n"
|
||||
"The MCA param orte_enable_recovery was not set to true, but\n"
|
||||
"positive value(s) were provided for the number of restarts:\n\n"
|
||||
"Max global restarts: %d\n"
|
||||
"Max local restarts: %d\n\n"
|
||||
"a positive value was provided for the number of restarts:\n\n"
|
||||
"Max restarts: %d\n"
|
||||
"We are enabling process recovery and continuing execution. To avoid\n"
|
||||
"this warning in the future, please set the orte_enable_recovery\n"
|
||||
"param to non-zero.\n"
|
||||
"------------------------------------------------------------------",
|
||||
orte_max_global_restarts, orte_max_local_restarts);
|
||||
orte_max_restarts);
|
||||
}
|
||||
orte_enable_recovery = true;
|
||||
}
|
||||
@ -514,17 +506,6 @@ int orte_register_params(void)
|
||||
INT_MAX, &value);
|
||||
orte_child_time_to_exit.tv_sec = value;
|
||||
orte_child_time_to_exit.tv_usec = 0;
|
||||
|
||||
mca_base_param_reg_int_name("orte", "enable_progress_threads",
|
||||
"Enable the use of ORTE progress threads in applications",
|
||||
false, false,
|
||||
(int)false, &value);
|
||||
|
||||
if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
|
||||
orte_progress_threads_enabled = true;
|
||||
} else {
|
||||
orte_progress_threads_enabled = OPAL_INT_TO_BOOL(value);
|
||||
}
|
||||
|
||||
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
||||
|
||||
|
@ -425,13 +425,9 @@ static opal_cmd_line_init_t cmd_line_init[] = {
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Enable recovery from process failure [Default = disabled]" },
|
||||
|
||||
{ "orte", "max", "global_restarts", '\0', "max-global-restarts", "max-global-restarts", 1,
|
||||
{ "orte", "max", "restarts", '\0', "max-restarts", "max-restarts", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Max number of times to relocate a failed process to a new node" },
|
||||
|
||||
{ "orte", "max", "local_restarts", '\0', "max-local-restarts", "max-local-restarts", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Max number of times to locally restart a failed process before relocating it to a new node" },
|
||||
"Max number of times to restart a failed process" },
|
||||
|
||||
#if OPAL_ENABLE_CRDEBUG == 1
|
||||
{ "opal", "cr", "enable_crdebug", '\0', "crdebug", "crdebug", 0,
|
||||
@ -955,8 +951,7 @@ static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line)
|
||||
/* if recovery was disabled on the cmd line, do so */
|
||||
if (orterun_globals.disable_recovery) {
|
||||
orte_enable_recovery = false;
|
||||
orte_max_local_restarts = 0;
|
||||
orte_max_global_restarts = 0;
|
||||
orte_max_restarts = 0;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user