1
1

Remove the distinction between local and global restarts - leave it up to the error strategy to decide which to do.

Clean up the heartbeat handling so that heartbeats are associated with the proc, not a node.

Clean up handling of recovery options so that defaults do not override user-provided values.

This commit was SVN r24382.
Этот коммит содержится в:
Ralph Castain 2011-02-14 20:49:12 +00:00
родитель 172ad649e1
Коммит a9dca25ca5
13 изменённых файлов: 207 добавлений и 252 удалений

Просмотреть файл

@ -583,9 +583,9 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
if( jdata->enable_recovery ) {
/* is this a local proc */
if (NULL != (child = proc_is_local(proc))) {
/* local proc - see if it has reached its local restart limit */
/* local proc - see if it has reached its restart limit */
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, child->app_idx);
if (child->restarts < app->max_local_restarts) {
if (child->restarts < app->max_restarts) {
child->restarts++;
if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) {
return ORTE_SUCCESS;
@ -594,9 +594,6 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
* have cleared it
*/
child->state = state;
ORTE_ERROR_LOG(rc);
/* let it fall thru to abort */
} else {
/* see if we can relocate it somewhere else */
if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) {
return ORTE_SUCCESS;
@ -1580,10 +1577,10 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc,
return ORTE_ERR_NOT_FOUND;
}
app_name = app->app;
/* track that we are attempting to relocate */
pdata->relocates++;
/* have we exceeded the number of relocates for this proc? */
if (app->max_global_restarts < pdata->relocates) {
/* track that we are attempting to restart */
pdata->restarts++;
/* have we exceeded the number of restarts for this proc? */
if (app->max_restarts < pdata->restarts) {
return ORTE_ERR_RESTART_LIMIT_EXCEEDED;
}

Просмотреть файл

@ -310,7 +310,7 @@ static int update_state(orte_jobid_t job,
killprocs(proc->jobid, proc->vpid);
}
app = jobdat->apps[child->app_idx];
if( jobdat->enable_recovery && child->restarts < app->max_local_restarts ) {
if( jobdat->enable_recovery && child->restarts < app->max_restarts ) {
child->restarts++;
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:orted restarting proc %s for the %d time",
@ -340,8 +340,8 @@ static int update_state(orte_jobid_t job,
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s CHECKING RESTARTS %d VS MAX %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
child->restarts, app->max_local_restarts));
if (child->restarts < app->max_local_restarts ) {
child->restarts, app->max_restarts));
if (child->restarts < app->max_restarts ) {
/* attempt to restart it locally */
child->restarts++;
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,

Просмотреть файл

@ -72,9 +72,9 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
{
orte_job_t *jdatorted;
orte_app_context_t *app;
int rc, tmp;
int rc;
int32_t ljob;
orte_app_idx_t i;
int i;
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:setup_job for job %s",
@ -93,34 +93,18 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
ljob = ORTE_LOCAL_JOBID(jdata->jobid);
opal_pointer_array_set_item(orte_job_data, ljob, jdata);
/* see if recovery was set in the app */
for (i=0; i < jdata->num_apps; i++) {
/* if job recovery is not defined, set it to default */
if (!jdata->recovery_defined) {
/* set to system default */
jdata->enable_recovery = orte_enable_recovery;
}
/* if app recovery is not defined, set apps to defaults */
for (i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
/* big problem! */
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
continue;
}
if (ORTE_SUCCESS == mca_base_param_find_int_name("orte", "enable_recovery", app->env, &tmp)) {
jdata->enable_recovery = OPAL_INT_TO_BOOL(tmp);
} else {
jdata->enable_recovery = orte_enable_recovery;
}
if (ORTE_SUCCESS == mca_base_param_find_int_name("orte", "max_global_restarts", app->env, &tmp)) {
app->max_global_restarts = tmp;
} else {
app->max_global_restarts = orte_max_global_restarts;
}
if (ORTE_SUCCESS == mca_base_param_find_int_name("orte", "max_local_restarts", app->env, &tmp)) {
app->max_local_restarts = tmp;
} else {
app->max_local_restarts = orte_max_local_restarts;
}
/* consistency check */
if (app->max_global_restarts > 0 ||
app->max_local_restarts > 0) {
jdata->enable_recovery = true;
if (!app->recovery_defined) {
app->max_restarts = orte_max_restarts;
}
}
}

Просмотреть файл

@ -86,7 +86,7 @@ static void rml_callback_fn(int status,
/* local globals */
static opal_event_t *send_ev = NULL, *check_ev = NULL;
static struct timeval send_time, check_time;
static double timeout;
static orte_job_t *daemons;
#include MCA_timer_IMPLEMENTATION_HEADER
static inline double gettime(void) __opal_attribute_always_inline__;
@ -106,12 +106,18 @@ static inline double gettime(void)
static int init(void)
{
int rc;
int rc=ORTE_SUCCESS;
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
"%s initializing heartbeat recvs",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* get the daemon job object */
if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
/* can't run */
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
#if ORTE_ENABLE_MULTICAST
/* setup multicast recv for heartbeats */
@ -133,7 +139,7 @@ static int init(void)
}
}
#endif
return rc;
}
@ -142,10 +148,12 @@ static void finalize(void)
if (NULL != send_ev) {
opal_event_del(send_ev);
free(send_ev);
send_ev = NULL;
}
if (NULL != check_ev) {
opal_event_del(check_ev);
free(check_ev);
check_ev = NULL;
}
#if ORTE_ENABLE_MULTICAST
@ -156,46 +164,67 @@ static void finalize(void)
return;
}
/*
 * Convert a "sec:usec" string into a timeval.
 *
 * input - NUL-terminated string holding up to two ':'-separated integer
 *         fields; may be NULL or empty
 * time  - receives the result; must not be NULL
 *
 * The first non-empty field is stored in tv_sec and the second
 * non-empty field in tv_usec. Empty fields (leading or repeated ':')
 * are skipped, fields beyond the second are ignored, and any field
 * that is missing or does not start with a number yields 0.
 *
 * The string is parsed in place, so nothing is allocated and there
 * is nothing for the caller (or this function) to release.
 */
static void setup_time(char *input, struct timeval *time)
{
    const char *field = input;

    /* set default */
    time->tv_sec = 0;
    time->tv_usec = 0;

    if (NULL == field) {
        /* nothing to do */
        return;
    }

    /* locate the first non-empty field - this is the seconds */
    while (':' == *field) {
        ++field;
    }
    if ('\0' == *field) {
        /* nothing to do */
        return;
    }
    /* strtol stops at the ':' separator, so no copy of the field is needed */
    time->tv_sec = strtol(field, NULL, 10);

    /* step over the remainder of the seconds field */
    while ('\0' != *field && ':' != *field) {
        ++field;
    }

    /* locate the next non-empty field - this is the microseconds */
    while (':' == *field) {
        ++field;
    }
    if ('\0' == *field) {
        /* no usec value was given - leave it at the default */
        return;
    }
    time->tv_usec = strtol(field, NULL, 10);
}
/*
* Start sending and checking heartbeats
*/
static void start(orte_jobid_t jobid)
{
uint64_t time;
if (jobid != ORTE_PROC_MY_NAME->jobid && ORTE_JOBID_WILDCARD != jobid) {
/* heartbeats are only for daemons and HNPs */
/* convert the send rate */
setup_time(mca_sensor_heartbeat_component.rate, &send_time);
if (0 == send_time.tv_sec &&
0 == send_time.tv_usec) {
/* nothing to do */
return;
}
if (!ORTE_PROC_IS_DAEMON) {
/* convert the check rate */
setup_time(mca_sensor_heartbeat_component.check, &check_time);
if (0 == check_time.tv_sec &&
0 == check_time.tv_usec) {
/* no sense in running if we won't check */
return;
}
/* setup the check */
check_ev = (opal_event_t*)malloc(sizeof(opal_event_t));
opal_event_evtimer_set(opal_event_base, check_ev, check_heartbeat, check_ev);
opal_event_evtimer_add(check_ev, &check_time);
}
/* setup the send */
time = mca_sensor_heartbeat_component.rate * 1000; /* convert to microsecs */
send_ev = (opal_event_t *) malloc(sizeof(opal_event_t));
send_ev = (opal_event_t*)malloc(sizeof(opal_event_t));
opal_event_evtimer_set(opal_event_base, send_ev, send_heartbeat, send_ev);
send_time.tv_sec = time / 1000000;
send_time.tv_usec = time % 1000000;
opal_event_evtimer_add(send_ev, &send_time);
/* define the timeout */
timeout = 2.0 * (double)time;
/* setup the check */
time = mca_sensor_heartbeat_component.check * 1000; /* convert to microsecs */
check_ev = (opal_event_t *) malloc(sizeof(opal_event_t));
opal_event_evtimer_set(opal_event_base, check_ev, check_heartbeat, check_ev);
check_time.tv_sec = time / 1000000;
check_time.tv_usec = time % 1000000;
opal_event_evtimer_add(check_ev, &check_time);
}
static void stop(orte_jobid_t jobid)
{
if (jobid != ORTE_PROC_MY_NAME->jobid && ORTE_JOBID_WILDCARD != jobid) {
/* heartbeats are only for daemons and HNPs */
return;
}
if (NULL != send_ev) {
opal_event_del(send_ev);
free(send_ev);
@ -217,7 +246,7 @@ static void send_heartbeat(int fd, short event, void *arg)
/* if we are aborting or shutting down, ignore this */
if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) {
return;
goto reset;
}
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
@ -233,7 +262,7 @@ static void send_heartbeat(int fd, short event, void *arg)
rmcast_callback_fn, NULL))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
return;
goto reset;
}
#else
/* send heartbeat to HNP */
@ -242,10 +271,11 @@ static void send_heartbeat(int fd, short event, void *arg)
rml_callback_fn, NULL))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
return;
goto reset;
}
#endif
reset:
/* reset the timer */
opal_event_evtimer_add(tmp, &send_time);
}
@ -257,22 +287,26 @@ static void send_heartbeat(int fd, short event, void *arg)
static void check_heartbeat(int fd, short dummy, void *arg)
{
int v;
orte_nid_t *nid;
double now;
orte_proc_t *proc;
time_t now;
opal_event_t *tmp = (opal_event_t*)arg;
orte_process_name_t name;
double delta;
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
"%s sensor:check_heartbeat",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* if we are aborting or shutting down, ignore this */
if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) {
return;
goto reset;
}
name.jobid = ORTE_PROC_MY_NAME->jobid;
/* compute a send time interval */
delta = send_time.tv_sec + (double)send_time.tv_usec/1000000.0;
/* get current time */
now = gettime();
@ -280,26 +314,50 @@ static void check_heartbeat(int fd, short dummy, void *arg)
* in case multiple daemons are late so all of those that did
* can be appropriately flagged
*/
for (v=0; v < orte_nidmap.size; v++) {
if (NULL == (nid = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, v))) {
for (v=0; v < daemons->procs->size; v++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, v))) {
continue;
}
if (0 == nid->beat) {
/* ignore myself */
if ((int)ORTE_PROC_MY_NAME->vpid == v) {
continue;
}
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
"%s CHECKING HEARTBEAT FOR %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name)));
if (0 == proc->beat) {
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
"%s NO BEAT YET",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* haven't recvd a beat yet */
continue;
}
if ((now - nid->beat) > timeout) {
nid->missed++;
if (mca_sensor_heartbeat_component.missed < nid->missed) {
/* heartbeat failed */
name.vpid = v;
orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_HEARTBEAT_FAILED,
&name, ORTE_PROC_STATE_HEARTBEAT_FAILED,
0, ORTE_ERR_HEARTBEAT_LOST);
}
/* compute number of heartbeats missed */
proc->missed = (int)((double)(now - proc->beat) / delta);
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
"%s MISSING %d BEATS",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), proc->missed));
if (mca_sensor_heartbeat_component.missed < proc->missed) {
/* heartbeat failed */
name.vpid = v;
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
"%s sensor:check_heartbeat FAILED for daemon %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&name)));
orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_HEARTBEAT_FAILED,
&name, ORTE_PROC_STATE_HEARTBEAT_FAILED,
0, ORTE_ERR_HEARTBEAT_LOST);
/* zero the last beat to indicate we are waiting to recv
* the first beat from the restarted daemon
*/
proc->beat = 0;
}
}
reset:
/* reset the timer */
opal_event_evtimer_add(tmp, &check_time);
}
@ -312,7 +370,7 @@ static void recv_rmcast_beats(int status,
orte_process_name_t *sender,
opal_buffer_t *buf, void* cbdata)
{
orte_nid_t *nid;
orte_proc_t *proc;
/* if we are aborting or shutting down, ignore this */
if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) {
@ -324,15 +382,13 @@ static void recv_rmcast_beats(int status,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(sender)));
/* get this daemon's nid - if it isn't here, just ignore
* as this is caused by a race condition at startup
*/
if (NULL != (nid = orte_util_lookup_nid(sender))) {
/* get this daemon's object */
if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, sender->vpid))) {
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
"%s updating beat time for %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(sender)));
nid->beat = gettime();
proc->beat = gettime();
}
}
@ -351,11 +407,11 @@ static void recv_rml_beats(int status, orte_process_name_t* sender,
opal_buffer_t* buffer, orte_rml_tag_t tag,
void* cbdata)
{
orte_nid_t *nid;
orte_proc_t *proc;
/* if we are aborting or shutting down, ignore this */
if (orte_abnormal_term_ordered || orte_finalizing || !orte_intialized) {
return;
goto reset;
}
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
@ -363,22 +419,16 @@ static void recv_rml_beats(int status, orte_process_name_t* sender,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(sender)));
/* get this daemon's nid - if it isn't here, just ignore
* as this is caused by a race condition at startup
*/
if (NULL != (nid = orte_util_lookup_nid(sender))) {
/* get this daemon's object */
if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, sender->vpid))) {
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
"%s updating beat time for %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(sender)));
nid->beat = gettime();
} else {
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
"%s no nidmap entry for %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(sender)));
proc->beat = gettime();
}
reset:
/* reissue the recv */
if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
ORTE_RML_TAG_HEARTBEAT,

Просмотреть файл

@ -23,8 +23,8 @@ BEGIN_C_DECLS
struct orte_sensor_heartbeat_component_t {
orte_sensor_base_component_t super;
int rate;
int check;
char *rate;
char *check;
int missed;
};
typedef struct orte_sensor_heartbeat_component_t orte_sensor_heartbeat_component_t;

Просмотреть файл

@ -58,19 +58,17 @@ static int orte_sensor_heartbeat_open(void)
int tmp;
/* lookup parameters */
mca_base_param_reg_int(c, "rate",
"Heartbeat rate in milliseconds (default=0)",
false, false, 0, &tmp);
mca_sensor_heartbeat_component.rate = tmp;
mca_base_param_reg_string(c, "rate",
"Heartbeat rate in sec (default=0:0)",
false, false, "0:0", &mca_sensor_heartbeat_component.rate);
mca_base_param_reg_int(c, "check",
"Check for failure rate in milliseconds (default=500)",
false, false, 500, &tmp);
mca_sensor_heartbeat_component.check = tmp;
mca_base_param_reg_string(c, "check",
"Check for failure rate in sec:usec (default=1:0)",
false, false, "1:0", &mca_sensor_heartbeat_component.check);
mca_base_param_reg_int(c, "missed",
"Number of missed heartbeats before failure is declared (default=5)",
false, false, 5, &tmp);
"Number of missed heartbeat checks before failure is declared (default=2)",
false, false, 2, &tmp);
mca_sensor_heartbeat_component.missed = tmp;
return ORTE_SUCCESS;
@ -79,18 +77,9 @@ static int orte_sensor_heartbeat_open(void)
static int orte_sensor_heartbeat_query(mca_base_module_t **module, int *priority)
{
/* only usable by daemons and HNPs */
if (0 < mca_sensor_heartbeat_component.rate &&
(ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP)) {
*priority = 10; /* use if we were built */
*module = (mca_base_module_t *)&orte_sensor_heartbeat_module;
return ORTE_SUCCESS;
}
/* otherwise, we are not available */
*priority = 0;
*module = NULL;
return ORTE_ERROR;
*priority = 10; /* use if we were built */
*module = (mca_base_module_t *)&orte_sensor_heartbeat_module;
return ORTE_SUCCESS;
}
/**

Просмотреть файл

@ -284,6 +284,13 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src,
return rc;
}
/* pack the recovery policy defined flag */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(jobs[i]->recovery_defined)), 1, OPAL_BOOL))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the recovery flag */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(jobs[i]->enable_recovery)), 1, OPAL_BOOL))) {
@ -477,13 +484,6 @@ int orte_dt_pack_proc(opal_buffer_t *buffer, const void *src,
return rc;
}
/* pack the number of relocates */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)&(procs[i]->relocates), 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
#if OPAL_ENABLE_FT_CR == 1
/* pack the ckpt state */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
@ -749,21 +749,9 @@ int orte_dt_pack_app_context(opal_buffer_t *buffer, const void *src,
}
}
/* pack the restart limits */
/* pack the restart limit */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(app_context[i]->max_local_restarts)), 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(app_context[i]->max_global_restarts)), 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the constrain flag */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(app_context[i]->constrain)), 1, OPAL_BOOL))) {
(void*)(&(app_context[i]->max_restarts)), 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}

Просмотреть файл

@ -498,8 +498,8 @@ int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_
free(tmp);
tmp = tmp2;
asprintf(&tmp2, "%s\n%s\tState: %s\tRestarts: %d\tRelocates: %d\tApp_context: %ld\tSlot list: %s", tmp, pfx2,
orte_proc_state_to_str(src->state), src->restarts, src->relocates, (long)src->app_idx,
asprintf(&tmp2, "%s\n%s\tState: %s\tRestarts: %d\tApp_context: %ld\tSlot list: %s", tmp, pfx2,
orte_proc_state_to_str(src->state), src->restarts, (long)src->app_idx,
(NULL == src->slot_list) ? "NULL" : src->slot_list);
free(tmp);
@ -528,10 +528,9 @@ int orte_dt_print_app_context(char **output, char *prefix, orte_app_context_t *s
asprintf(&pfx2, "%s", prefix);
}
asprintf(&tmp, "\n%sData for app_context: name: %s\t index %lu\tapp: %s\n%s\tNum procs: %lu\tMax Local Restarts: %d\tMax Global Restarts %d\tConstrain: %s",
asprintf(&tmp, "\n%sData for app_context: name: %s\t index %lu\tapp: %s\n%s\tNum procs: %lu\tMax Restarts: %d",
pfx2, (NULL == src->name) ? "NULL" : src->name, (unsigned long)src->idx, (NULL == src->app) ? "NULL" : src->app,
pfx2, (unsigned long)src->num_procs, src->max_local_restarts, src->max_global_restarts,
src->constrain ? "TRUE" : "FALSE");
pfx2, (unsigned long)src->num_procs, src->max_restarts);
count = opal_argv_count(src->argv);
for (i=0; i < count; i++) {

Просмотреть файл

@ -297,6 +297,14 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest,
return rc;
}
/* unpack the recovery policy defined flag */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
(&(jobs[i]->recovery_defined)), &n, OPAL_BOOL))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the recovery flag */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
@ -520,14 +528,6 @@ int orte_dt_unpack_proc(opal_buffer_t *buffer, void *dest,
return rc;
}
/* unpack the number of relocates */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
(&(procs[i]->relocates)), &n, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
#if OPAL_ENABLE_FT_CR == 1
/* unpack the ckpt state */
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
@ -825,27 +825,13 @@ int orte_dt_unpack_app_context(opal_buffer_t *buffer, void *dest,
app_context[i]->preload_files_src_dir = NULL;
}
/* unpack the restart limits */
/* unpack the restart limit */
max_n=1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &app_context[i]->max_local_restarts,
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &app_context[i]->max_restarts,
&max_n, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
max_n=1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &app_context[i]->max_global_restarts,
&max_n, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the constrain flag */
max_n=1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &app_context[i]->constrain,
&max_n, OPAL_BOOL))) {
ORTE_ERROR_LOG(rc);
return rc;
}
#if OPAL_ENABLE_FT_CR == 1
/* Unpack the sstore_load */

Просмотреть файл

@ -173,8 +173,7 @@ bool orte_do_not_barrier = false;
/* process recovery */
bool orte_enable_recovery;
int32_t orte_max_global_restarts;
int32_t orte_max_local_restarts;
int32_t orte_max_restarts;
/* comm fn for updating state */
orte_default_comm_fn_t orte_comm;
@ -183,9 +182,6 @@ orte_default_comm_fn_t orte_comm;
bool orte_report_child_jobs_separately;
struct timeval orte_child_time_to_exit;
/* orte progress threads */
bool orte_progress_threads_enabled;
#endif /* !ORTE_DISABLE_FULL_RTE */
int orte_debug_output = -1;
@ -567,9 +563,8 @@ static void orte_app_context_construct(orte_app_context_t* app_context)
#if OPAL_ENABLE_FT_CR == 1
app_context->sstore_load = NULL;
#endif
app_context->max_local_restarts = -1;
app_context->max_global_restarts = -1;
app_context->constrain = true;
app_context->recovery_defined = false;
app_context->max_restarts = -1000;
}
static void orte_app_context_destructor(orte_app_context_t* app_context)
@ -692,6 +687,7 @@ static void orte_job_construct(orte_job_t* job)
OBJ_CONSTRUCT(&job->dyn_spawn_cond, opal_condition_t);
job->dyn_spawn_active = false;
job->recovery_defined = false;
job->enable_recovery = false;
job->launch_msg_sent.tv_sec = 0;
@ -892,7 +888,10 @@ static void orte_proc_construct(orte_proc_t* proc)
proc->nodename = NULL;
proc->rml_uri = NULL;
proc->restarts = 0;
proc->relocates = 0;
#if ORTE_ENABLE_HEARTBEAT
proc->beat = 0;
proc->missed = 0;
#endif
#if OPAL_ENABLE_FT_CR == 1
proc->ckpt_state = 0;
proc->ckpt_snapshot_ref = NULL;
@ -971,10 +970,6 @@ static void orte_nid_construct(orte_nid_t *ptr)
ptr->oversubscribed = false;
OBJ_CONSTRUCT(&ptr->attrs, opal_list_t);
OBJ_CONSTRUCT(&ptr->sysinfo, opal_list_t);
#if ORTE_ENABLE_HEARTBEAT
ptr->beat = 0;
ptr->missed = 0;
#endif
}
static void orte_nid_destruct(orte_nid_t *ptr)

Просмотреть файл

@ -227,15 +227,10 @@ typedef struct {
/** What files SStore should load before local launch, if any */
char *sstore_load;
#endif
/* max number of times a process can be restarted locally */
int32_t max_local_restarts;
/* max number of times a process can be relocated to another node */
int32_t max_global_restarts;
/* whether or not the procs in this app are constrained to stay
* on the specified nodes when restarted, or can move to any
* known node
*/
bool constrain;
/* recovery policy has been defined */
bool recovery_defined;
/* max number of times a process can be restarted */
int32_t max_restarts;
} orte_app_context_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_app_context_t);
@ -423,6 +418,8 @@ typedef struct {
bool abort;
/* proc that caused that to happen */
struct orte_proc_t *aborted_proc;
/* recovery policy has been defined */
bool recovery_defined;
/* enable recovery of these processes */
bool enable_recovery;
/* time launch message was sent */
@ -485,8 +482,12 @@ struct orte_proc_t {
char *rml_uri;
/* number of times this process has been restarted */
int32_t restarts;
/* number of times this process has been relocated */
int32_t relocates;
#if ORTE_ENABLE_HEARTBEAT
/* time when last heartbeat was detected */
double beat;
/* number of missed heartbeats */
int missed;
#endif
#if OPAL_ENABLE_FT_CR == 1
/* ckpt state */
size_t ckpt_state;
@ -522,12 +523,6 @@ typedef struct {
opal_list_t attrs;
/* list of system info */
opal_list_t sysinfo;
#if ORTE_ENABLE_HEARTBEAT
/* seconds when last heartbeat was detected */
double beat;
/* number of missed heartbeats */
int missed;
#endif
} orte_nid_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_nid_t);
@ -704,8 +699,7 @@ ORTE_DECLSPEC extern bool orte_do_not_barrier;
/* process recovery */
ORTE_DECLSPEC extern bool orte_enable_recovery;
ORTE_DECLSPEC extern int32_t orte_max_global_restarts;
ORTE_DECLSPEC extern int32_t orte_max_local_restarts;
ORTE_DECLSPEC extern int32_t orte_max_restarts;
/* comm interface */
typedef void (*orte_default_cbfunc_t)(int fd, short event, void *data);
@ -724,9 +718,6 @@ ORTE_DECLSPEC int orte_global_comm(orte_process_name_t *recipient,
ORTE_DECLSPEC extern bool orte_report_child_jobs_separately;
ORTE_DECLSPEC extern struct timeval orte_child_time_to_exit;
/* orte progress threads */
ORTE_DECLSPEC extern bool orte_progress_threads_enabled;
#endif /* ORTE_DISABLE_FULL_SUPPORT */
END_C_DECLS

Просмотреть файл

@ -458,46 +458,38 @@ int orte_register_params(void)
(int)false, &value);
orte_enable_recovery = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int_name("orte", "max_global_restarts",
"Max number of times to relocate a failed process to a new node",
mca_base_param_reg_int_name("orte", "max_restarts",
"Max number of times to restart a failed process",
false, false,
-1, &orte_max_global_restarts);
-1, &orte_max_restarts);
mca_base_param_reg_int_name("orte", "max_local_restarts",
"Max number of times to locally restart a failed process before relocating it to a new node",
false, false,
-1, &orte_max_local_restarts);
if (orte_enable_recovery) {
if (orte_max_global_restarts <= 0 &&
orte_max_local_restarts <= 0) {
if (orte_max_restarts <= 0) {
if (ORTE_PROC_IS_HNP) {
opal_output(orte_clean_output,
"------------------------------------------------------------\n"
"Although the MCA param orte_enable_recovery was set to true,\n"
"values for the max number of restarts was not provided:\n\n"
"Max global restarts: %d\n"
"Max local restarts: %d\n\n"
"At least one of these must be a positive value. We are disabling\n"
"a value for the max number of restarts was not provided:\n\n"
"Max restarts: %d\n"
"This must be a positive value. We are disabling\n"
"process recovery, but continuing execution.\n"
"------------------------------------------------------------",
orte_max_global_restarts, orte_max_local_restarts);
orte_max_restarts);
}
orte_enable_recovery = false;
}
} else if (orte_max_global_restarts > 0 ||
orte_max_local_restarts > 0) {
} else if (orte_max_restarts > 0) {
if (ORTE_PROC_IS_HNP) {
opal_output(orte_clean_output,
"------------------------------------------------------------------\n"
"The MCA param orte_enable_recovery was not set to true, but\n"
"positive value(s) were provided for the number of restarts:\n\n"
"Max global restarts: %d\n"
"Max local restarts: %d\n\n"
"a positive value was provided for the number of restarts:\n\n"
"Max restarts: %d\n"
"We are enabling process recovery and continuing execution. To avoid\n"
"this warning in the future, please set the orte_enable_recovery\n"
"param to non-zero.\n"
"------------------------------------------------------------------",
orte_max_global_restarts, orte_max_local_restarts);
orte_max_restarts);
}
orte_enable_recovery = true;
}
@ -514,17 +506,6 @@ int orte_register_params(void)
INT_MAX, &value);
orte_child_time_to_exit.tv_sec = value;
orte_child_time_to_exit.tv_usec = 0;
mca_base_param_reg_int_name("orte", "enable_progress_threads",
"Enable the use of ORTE progress threads in applications",
false, false,
(int)false, &value);
if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
orte_progress_threads_enabled = true;
} else {
orte_progress_threads_enabled = OPAL_INT_TO_BOOL(value);
}
#endif /* ORTE_DISABLE_FULL_SUPPORT */

Просмотреть файл

@ -425,13 +425,9 @@ static opal_cmd_line_init_t cmd_line_init[] = {
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Enable recovery from process failure [Default = disabled]" },
{ "orte", "max", "global_restarts", '\0', "max-global-restarts", "max-global-restarts", 1,
{ "orte", "max", "restarts", '\0', "max-restarts", "max-restarts", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Max number of times to relocate a failed process to a new node" },
{ "orte", "max", "local_restarts", '\0', "max-local-restarts", "max-local-restarts", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Max number of times to locally restart a failed process before relocating it to a new node" },
"Max number of times to restart a failed process" },
#if OPAL_ENABLE_CRDEBUG == 1
{ "opal", "cr", "enable_crdebug", '\0', "crdebug", "crdebug", 0,
@ -955,8 +951,7 @@ static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line)
/* if recovery was disabled on the cmd line, do so */
if (orterun_globals.disable_recovery) {
orte_enable_recovery = false;
orte_max_local_restarts = 0;
orte_max_global_restarts = 0;
orte_max_restarts = 0;
}
return ORTE_SUCCESS;