From a9dca25ca5a198e18a0307b53ee8336bcad5f2dd Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 14 Feb 2011 20:49:12 +0000 Subject: [PATCH] Remove the distinction between local and global restarts - leave it up to the error strategy to decide which to do. Cleanup the heartbeat handling so it is associated with the proc, not a node. Cleanup handling of recovery options so that defaults do not override user values iff they are provided. This commit was SVN r24382. --- orte/mca/errmgr/hnp/errmgr_hnp.c | 15 +- orte/mca/errmgr/orted/errmgr_orted.c | 6 +- orte/mca/plm/base/plm_base_launch_support.c | 40 ++-- orte/mca/sensor/heartbeat/sensor_heartbeat.c | 178 +++++++++++------- orte/mca/sensor/heartbeat/sensor_heartbeat.h | 4 +- .../heartbeat/sensor_heartbeat_component.c | 33 ++-- .../data_type_support/orte_dt_packing_fns.c | 30 +-- .../data_type_support/orte_dt_print_fns.c | 9 +- .../data_type_support/orte_dt_unpacking_fns.c | 34 +--- orte/runtime/orte_globals.c | 21 +-- orte/runtime/orte_globals.h | 35 ++-- orte/runtime/orte_mca_params.c | 43 ++--- orte/tools/orterun/orterun.c | 11 +- 13 files changed, 207 insertions(+), 252 deletions(-) diff --git a/orte/mca/errmgr/hnp/errmgr_hnp.c b/orte/mca/errmgr/hnp/errmgr_hnp.c index f93d86db7c..0613e9ef8d 100644 --- a/orte/mca/errmgr/hnp/errmgr_hnp.c +++ b/orte/mca/errmgr/hnp/errmgr_hnp.c @@ -583,9 +583,9 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job, if( jdata->enable_recovery ) { /* is this a local proc */ if (NULL != (child = proc_is_local(proc))) { - /* local proc - see if it has reached its local restart limit */ + /* local proc - see if it has reached its restart limit */ app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, child->app_idx); - if (child->restarts < app->max_local_restarts) { + if (child->restarts < app->max_restarts) { child->restarts++; if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) { return ORTE_SUCCESS; @@ -594,9 +594,6 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job, * have cleared it */ child->state = state; - ORTE_ERROR_LOG(rc); - /* let it fall thru to abort */ - } else { /* see if we can relocate it somewhere else */ if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { return ORTE_SUCCESS; @@ -1580,10 +1577,10 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc, return ORTE_ERR_NOT_FOUND; } app_name = app->app; - /* track that we are attempting to relocate */ - pdata->relocates++; - /* have we exceeded the number of relocates for this proc? */ - if (app->max_global_restarts < pdata->relocates) { + /* track that we are attempting to restart */ + pdata->restarts++; + /* have we exceeded the number of restarts for this proc? */ + if (app->max_restarts < pdata->restarts) { return ORTE_ERR_RESTART_LIMIT_EXCEEDED; } diff --git a/orte/mca/errmgr/orted/errmgr_orted.c b/orte/mca/errmgr/orted/errmgr_orted.c index 8d2b6a5725..31dc86cf4a 100644 --- a/orte/mca/errmgr/orted/errmgr_orted.c +++ b/orte/mca/errmgr/orted/errmgr_orted.c @@ -310,7 +310,7 @@ static int update_state(orte_jobid_t job, killprocs(proc->jobid, proc->vpid); } app = jobdat->apps[child->app_idx]; - if( jobdat->enable_recovery && child->restarts < app->max_local_restarts ) { + if( jobdat->enable_recovery && child->restarts < app->max_restarts ) { child->restarts++; OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:orted restarting proc %s for the %d time", @@ -340,8 +340,8 @@ static int update_state(orte_jobid_t job, OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s CHECKING RESTARTS %d VS MAX %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - child->restarts, app->max_local_restarts)); - if (child->restarts < app->max_local_restarts ) { + child->restarts, app->max_restarts)); + if (child->restarts < app->max_restarts ) { /* attempt to restart it locally */ child->restarts++; OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index a71fe04777..7244e72921 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -72,9 +72,9 @@ int orte_plm_base_setup_job(orte_job_t *jdata) { orte_job_t *jdatorted; orte_app_context_t *app; - int rc, tmp; + int rc; int32_t ljob; - orte_app_idx_t i; + int i; OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, "%s plm:base:setup_job for job %s", @@ -93,34 +93,18 @@ int orte_plm_base_setup_job(orte_job_t *jdata) ljob = ORTE_LOCAL_JOBID(jdata->jobid); opal_pointer_array_set_item(orte_job_data, ljob, jdata); - /* see if recovery was set in the app */ - for (i=0; i < jdata->num_apps; i++) { + /* if job recovery is not defined, set it to default */ + if (!jdata->recovery_defined) { + /* set to system default */ + jdata->enable_recovery = orte_enable_recovery; + } + /* if app recovery is not defined, set apps to defaults */ + for (i=0; i < jdata->apps->size; i++) { if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { - /* big problem! */ - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; + continue; } - if (ORTE_SUCCESS == mca_base_param_find_int_name("orte", "enable_recovery", app->env, &tmp)) { - jdata->enable_recovery = OPAL_INT_TO_BOOL(tmp); - } else { - jdata->enable_recovery = orte_enable_recovery; - } - if (ORTE_SUCCESS == mca_base_param_find_int_name("orte", "max_global_restarts", app->env, &tmp)) { - app->max_global_restarts = tmp; - } else { - app->max_global_restarts = orte_max_global_restarts; - } - if (ORTE_SUCCESS == mca_base_param_find_int_name("orte", "max_local_restarts", app->env, &tmp)) { - app->max_local_restarts = tmp; - } else { - app->max_local_restarts = orte_max_local_restarts; - - } - /* consistency check */ - if (app->max_global_restarts > 0 || - app->max_local_restarts > 0) { - jdata->enable_recovery = true; - + if (!app->recovery_defined) { + app->max_restarts = orte_max_restarts; } } } diff --git a/orte/mca/sensor/heartbeat/sensor_heartbeat.c b/orte/mca/sensor/heartbeat/sensor_heartbeat.c index ab8c39f86b..84092f1bd0 100644 --- a/orte/mca/sensor/heartbeat/sensor_heartbeat.c +++ b/orte/mca/sensor/heartbeat/sensor_heartbeat.c @@ -86,7 +86,7 @@ static void rml_callback_fn(int status, /* local globals */ static opal_event_t *send_ev = NULL, *check_ev = NULL; static struct timeval send_time, check_time; -static double timeout; +static orte_job_t *daemons; #include MCA_timer_IMPLEMENTATION_HEADER static inline double gettime(void) __opal_attribute_always_inline__; @@ -106,12 +106,18 @@ static inline double gettime(void) static int init(void) { - int rc; + int rc=ORTE_SUCCESS; OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, "%s initializing heartbeat recvs", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + /* get the daemon job object */ + if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { + /* can't run */ + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } #if ORTE_ENABLE_MULTICAST /* setup multicast recv for heartbeats */ @@ -133,7 +139,7 @@ static int init(void) } } #endif - + return rc; } @@ -142,10 +148,12 @@ static void finalize(void) if (NULL != send_ev) { opal_event_del(send_ev); free(send_ev); + send_ev = NULL; } if (NULL != check_ev) { opal_event_del(check_ev); free(check_ev); + check_ev = NULL; } #if ORTE_ENABLE_MULTICAST @@ -156,46 +164,67 @@ static void finalize(void) return; } +static void setup_time(char *input, struct timeval *time) +{ + char **val; + + /* set default */ + time->tv_sec = 0; + time->tv_usec = 0; + + /* convert the rate to time */ + val = opal_argv_split(input, ':'); + if (NULL == val) { + /* nothing to do */ + return; + } + if (NULL != val[0]) { + time->tv_sec = strtol(val[0], NULL, 10); + } + if (NULL != val[1]) { + time->tv_usec = strtol(val[1], NULL, 10); + } +} + + /* * Start sending and checking heartbeats */ static void start(orte_jobid_t jobid) { - uint64_t time; - - if (jobid != ORTE_PROC_MY_NAME->jobid && ORTE_JOBID_WILDCARD != jobid) { - /* heartbeats are only for daemons and HNPs */ + /* convert the send rate */ + setup_time(mca_sensor_heartbeat_component.rate, &send_time); + if (0 == send_time.tv_sec && + 0 == send_time.tv_usec) { + /* nothing to do */ return; } - + + if (!ORTE_PROC_IS_DAEMON) { + /* convert the check rate */ + setup_time(mca_sensor_heartbeat_component.check, &check_time); + if (0 == check_time.tv_sec && + 0 == check_time.tv_usec) { + /* no sense in running if we won't check */ + return; + } + + /* setup the check */ + check_ev = (opal_event_t*)malloc(sizeof(opal_event_t)); + opal_event_evtimer_set(opal_event_base, check_ev, check_heartbeat, check_ev); + opal_event_evtimer_add(check_ev, &check_time); + } + /* setup the send */ - time = mca_sensor_heartbeat_component.rate * 1000; /* convert to microsecs */ - send_ev = (opal_event_t *) malloc(sizeof(opal_event_t)); + send_ev = (opal_event_t*)malloc(sizeof(opal_event_t)); opal_event_evtimer_set(opal_event_base, send_ev, send_heartbeat, send_ev); - send_time.tv_sec = time / 1000000; - send_time.tv_usec = time % 1000000; opal_event_evtimer_add(send_ev, &send_time); - /* define the timeout */ - timeout = 2.0 * (double)time; - - /* setup the check */ - time = mca_sensor_heartbeat_component.check * 1000; /* convert to microsecs */ - check_ev = (opal_event_t *) malloc(sizeof(opal_event_t)); - opal_event_evtimer_set(opal_event_base, check_ev, check_heartbeat, check_ev); - check_time.tv_sec = time / 1000000; - check_time.tv_usec = time % 1000000; - opal_event_evtimer_add(check_ev, &check_time); } static void stop(orte_jobid_t jobid) { - if (jobid != ORTE_PROC_MY_NAME->jobid && ORTE_JOBID_WILDCARD != jobid) { - /* heartbeats are only for daemons and HNPs */ - return; - } - if (NULL != send_ev) { opal_event_del(send_ev); free(send_ev); @@ -217,7 +246,7 @@ static void send_heartbeat(int fd, short event, void *arg) /* if we are aborting or shutting down, ignore this */ if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) { - return; + goto reset; } OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, @@ -233,7 +262,7 @@ static void send_heartbeat(int fd, short event, void *arg) rmcast_callback_fn, NULL))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buf); - return; + goto reset; } #else /* send heartbeat to HNP */ @@ -242,10 +271,11 @@ static void send_heartbeat(int fd, short event, void *arg) rml_callback_fn, NULL))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buf); - return; + goto reset; } #endif + reset: /* reset the timer */ opal_event_evtimer_add(tmp, &send_time); } @@ -257,22 +287,26 @@ static void send_heartbeat(int fd, short event, void *arg) static void check_heartbeat(int fd, short dummy, void *arg) { int v; - orte_nid_t *nid; - double now; + orte_proc_t *proc; + time_t now; opal_event_t *tmp = (opal_event_t*)arg; orte_process_name_t name; - + double delta; + OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, "%s sensor:check_heartbeat", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* if we are aborting or shutting down, ignore this */ if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) { - return; + goto reset; } name.jobid = ORTE_PROC_MY_NAME->jobid; + /* compute a send time interval */ + delta = send_time.tv_sec + (double)send_time.tv_usec/1000000.0; + /* get current time */ now = gettime(); @@ -280,26 +314,50 @@ static void check_heartbeat(int fd, short dummy, void *arg) * in case multiple daemons are late so all of those that did * can be appropriately flagged */ - for (v=0; v < orte_nidmap.size; v++) { - if (NULL == (nid = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, v))) { + for (v=0; v < daemons->procs->size; v++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, v))) { continue; } - if (0 == nid->beat) { + /* ignore myself */ + if ((int)ORTE_PROC_MY_NAME->vpid == v) { + continue; + } + OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, + "%s CHECKING HEARTBEAT FOR %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); + + if (0 == proc->beat) { + OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, + "%s NO BEAT YET", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* haven't recvd a beat yet */ continue; } - if ((now - nid->beat) > timeout) { - nid->missed++; - if (mca_sensor_heartbeat_component.missed < nid->missed) { - /* heartbeat failed */ - name.vpid = v; - orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_HEARTBEAT_FAILED, - &name, ORTE_PROC_STATE_HEARTBEAT_FAILED, - 0, ORTE_ERR_HEARTBEAT_LOST); - } + + /* compute number of heartbeats missed */ + proc->missed = (int)((double)(now - proc->beat) / delta); + OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, + "%s MISSING %d BEATS", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), proc->missed)); + if (mca_sensor_heartbeat_component.missed < proc->missed) { + /* heartbeat failed */ + name.vpid = v; + OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, + "%s sensor:check_heartbeat FAILED for daemon %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&name))); + orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_HEARTBEAT_FAILED, + &name, ORTE_PROC_STATE_HEARTBEAT_FAILED, + 0, ORTE_ERR_HEARTBEAT_LOST); + /* zero the last beat to indicate we are waiting to recv + * the first beat from the restarted daemon + */ + proc->beat = 0; } } + reset: /* reset the timer */ opal_event_evtimer_add(tmp, &check_time); } @@ -312,7 +370,7 @@ static void recv_rmcast_beats(int status, orte_process_name_t *sender, opal_buffer_t *buf, void* cbdata) { - orte_nid_t *nid; + orte_proc_t *proc; /* if we are aborting or shutting down, ignore this */ if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) { @@ -324,15 +382,13 @@ static void recv_rmcast_beats(int status, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(sender))); - /* get this daemon's nid - if it isn't here, just ignore - * as this is caused by a race condition at startup - */ - if (NULL != (nid = orte_util_lookup_nid(sender))) { + /* get this daemon's object */ + if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, sender->vpid))) { OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, "%s updating beat time for %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(sender))); - nid->beat = gettime(); + proc->beat = gettime(); } } @@ -351,11 +407,11 @@ static void recv_rml_beats(int status, orte_process_name_t* sender, opal_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata) { - orte_nid_t *nid; + orte_proc_t *proc; /* if we are aborting or shutting down, ignore this */ if (orte_abnormal_term_ordered || orte_finalizing || !orte_intialized) { - return; + goto reset; } OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, @@ -363,22 +419,16 @@ static void recv_rml_beats(int status, orte_process_name_t* sender, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(sender))); - /* get this daemon's nid - if it isn't here, just ignore - * as this is caused by a race condition at startup - */ - if (NULL != (nid = orte_util_lookup_nid(sender))) { + /* get this daemon's object */ + if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, sender->vpid))) { OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, "%s updating beat time for %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(sender))); - nid->beat = gettime(); - } else { - OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, - "%s no nidmap entry for %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(sender))); + proc->beat = gettime(); } + reset: /* reissue the recv */ if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_HEARTBEAT, diff --git a/orte/mca/sensor/heartbeat/sensor_heartbeat.h b/orte/mca/sensor/heartbeat/sensor_heartbeat.h index 2cda7be596..4a71ce630d 100644 --- a/orte/mca/sensor/heartbeat/sensor_heartbeat.h +++ b/orte/mca/sensor/heartbeat/sensor_heartbeat.h @@ -23,8 +23,8 @@ BEGIN_C_DECLS struct orte_sensor_heartbeat_component_t { orte_sensor_base_component_t super; - int rate; - int check; + char *rate; + char *check; int missed; }; typedef struct orte_sensor_heartbeat_component_t orte_sensor_heartbeat_component_t; diff --git a/orte/mca/sensor/heartbeat/sensor_heartbeat_component.c b/orte/mca/sensor/heartbeat/sensor_heartbeat_component.c index 493742d44a..aa4125e41d 100644 --- a/orte/mca/sensor/heartbeat/sensor_heartbeat_component.c +++ b/orte/mca/sensor/heartbeat/sensor_heartbeat_component.c @@ -58,19 +58,17 @@ static int orte_sensor_heartbeat_open(void) int tmp; /* lookup parameters */ - mca_base_param_reg_int(c, "rate", - "Heartbeat rate in milliseconds (default=0)", - false, false, 0, &tmp); - mca_sensor_heartbeat_component.rate = tmp; + mca_base_param_reg_string(c, "rate", + "Heartbeat rate in sec (default=0:0)", + false, false, "0:0", &mca_sensor_heartbeat_component.rate); - mca_base_param_reg_int(c, "check", - "Check for failure rate in milliseconds (default=500)", - false, false, 500, &tmp); - mca_sensor_heartbeat_component.check = tmp; + mca_base_param_reg_string(c, "check", + "Check for failure rate in sec:usec (default=1:0)", + false, false, "1:0", &mca_sensor_heartbeat_component.check); mca_base_param_reg_int(c, "missed", - "Number of missed heartbeats before failure is declared (default=5)", - false, false, 5, &tmp); + "Number of missed heartbeat checks before failure is declared (default=2)", + false, false, 2, &tmp); mca_sensor_heartbeat_component.missed = tmp; return ORTE_SUCCESS; @@ -79,18 +77,9 @@ static int orte_sensor_heartbeat_open(void) static int orte_sensor_heartbeat_query(mca_base_module_t **module, int *priority) { - /* only usable by daemons and HNPs */ - if (0 < mca_sensor_heartbeat_component.rate && - (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP)) { - *priority = 10; /* use if we were built */ - *module = (mca_base_module_t *)&orte_sensor_heartbeat_module; - return ORTE_SUCCESS; - } - - /* otherwise, we are not available */ - *priority = 0; - *module = NULL; - return ORTE_ERROR; + *priority = 10; /* use if we were built */ + *module = (mca_base_module_t *)&orte_sensor_heartbeat_module; + return ORTE_SUCCESS; } /** diff --git a/orte/runtime/data_type_support/orte_dt_packing_fns.c b/orte/runtime/data_type_support/orte_dt_packing_fns.c index 326d2677ed..96734d4e68 100644 --- a/orte/runtime/data_type_support/orte_dt_packing_fns.c +++ b/orte/runtime/data_type_support/orte_dt_packing_fns.c @@ -284,6 +284,13 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src, return rc; } + /* pack the recovery policy defined flag */ + if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, + (void*)(&(jobs[i]->recovery_defined)), 1, OPAL_BOOL))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* pack the recovery flag */ if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)(&(jobs[i]->enable_recovery)), 1, OPAL_BOOL))) { @@ -477,13 +484,6 @@ int orte_dt_pack_proc(opal_buffer_t *buffer, const void *src, return rc; } - /* pack the number of relocates */ - if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, - (void*)&(procs[i]->relocates), 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return rc; - } - #if OPAL_ENABLE_FT_CR == 1 /* pack the ckpt state */ if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, @@ -749,21 +749,9 @@ int orte_dt_pack_app_context(opal_buffer_t *buffer, const void *src, } } - /* pack the restart limits */ + /* pack the restart limit */ if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, - (void*)(&(app_context[i]->max_local_restarts)), 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return rc; - } - if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, - (void*)(&(app_context[i]->max_global_restarts)), 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* pack the constrain flag */ - if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, - (void*)(&(app_context[i]->constrain)), 1, OPAL_BOOL))) { + (void*)(&(app_context[i]->max_restarts)), 1, OPAL_INT32))) { ORTE_ERROR_LOG(rc); return rc; } diff --git a/orte/runtime/data_type_support/orte_dt_print_fns.c b/orte/runtime/data_type_support/orte_dt_print_fns.c index 836ee55174..46a852a7f0 100644 --- a/orte/runtime/data_type_support/orte_dt_print_fns.c +++ b/orte/runtime/data_type_support/orte_dt_print_fns.c @@ -498,8 +498,8 @@ int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_ free(tmp); tmp = tmp2; - asprintf(&tmp2, "%s\n%s\tState: %s\tRestarts: %d\tRelocates: %d\tApp_context: %ld\tSlot list: %s", tmp, pfx2, - orte_proc_state_to_str(src->state), src->restarts, src->relocates, (long)src->app_idx, + asprintf(&tmp2, "%s\n%s\tState: %s\tRestarts: %d\tApp_context: %ld\tSlot list: %s", tmp, pfx2, + orte_proc_state_to_str(src->state), src->restarts, (long)src->app_idx, (NULL == src->slot_list) ? "NULL" : src->slot_list); free(tmp); @@ -528,10 +528,9 @@ int orte_dt_print_app_context(char **output, char *prefix, orte_app_context_t *s asprintf(&pfx2, "%s", prefix); } - asprintf(&tmp, "\n%sData for app_context: name: %s\t index %lu\tapp: %s\n%s\tNum procs: %lu\tMax Local Restarts: %d\tMax Global Restarts %d\tConstrain: %s", + asprintf(&tmp, "\n%sData for app_context: name: %s\t index %lu\tapp: %s\n%s\tNum procs: %lu\tMax Restarts: %d", pfx2, (NULL == src->name) ? "NULL" : src->name, (unsigned long)src->idx, (NULL == src->app) ? "NULL" : src->app, - pfx2, (unsigned long)src->num_procs, src->max_local_restarts, src->max_global_restarts, - src->constrain ? "TRUE" : "FALSE"); + pfx2, (unsigned long)src->num_procs, src->max_restarts); count = opal_argv_count(src->argv); for (i=0; i < count; i++) { diff --git a/orte/runtime/data_type_support/orte_dt_unpacking_fns.c b/orte/runtime/data_type_support/orte_dt_unpacking_fns.c index 533d2f827e..8202b05ff2 100644 --- a/orte/runtime/data_type_support/orte_dt_unpacking_fns.c +++ b/orte/runtime/data_type_support/orte_dt_unpacking_fns.c @@ -297,6 +297,14 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest, return rc; } + /* unpack the recovery policy defined flag */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, + (&(jobs[i]->recovery_defined)), &n, OPAL_BOOL))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* unpack the recovery flag */ n = 1; if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, @@ -520,14 +528,6 @@ int orte_dt_unpack_proc(opal_buffer_t *buffer, void *dest, return rc; } - /* unpack the number of relocates */ - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, - (&(procs[i]->relocates)), &n, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return rc; - } - #if OPAL_ENABLE_FT_CR == 1 /* unpack the ckpt state */ if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, @@ -825,27 +825,13 @@ int orte_dt_unpack_app_context(opal_buffer_t *buffer, void *dest, app_context[i]->preload_files_src_dir = NULL; } - /* unpack the restart limits */ + /* unpack the restart limit */ max_n=1; - if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &app_context[i]->max_local_restarts, + if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &app_context[i]->max_restarts, &max_n, OPAL_INT32))) { ORTE_ERROR_LOG(rc); return rc; } - max_n=1; - if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &app_context[i]->max_global_restarts, - &max_n, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* unpack the constrain flag */ - max_n=1; - if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &app_context[i]->constrain, - &max_n, OPAL_BOOL))) { - ORTE_ERROR_LOG(rc); - return rc; - } #if OPAL_ENABLE_FT_CR == 1 /* Unpack the sstore_load */ diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index 1e16f06385..326545441b 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -173,8 +173,7 @@ bool orte_do_not_barrier = false; /* process recovery */ bool orte_enable_recovery; -int32_t orte_max_global_restarts; -int32_t orte_max_local_restarts; +int32_t orte_max_restarts; /* comm fn for updating state */ orte_default_comm_fn_t orte_comm; @@ -183,9 +182,6 @@ orte_default_comm_fn_t orte_comm; bool orte_report_child_jobs_separately; struct timeval orte_child_time_to_exit; -/* orte progress threads */ -bool orte_progress_threads_enabled; - #endif /* !ORTE_DISABLE_FULL_RTE */ int orte_debug_output = -1; @@ -567,9 +563,8 @@ static void orte_app_context_construct(orte_app_context_t* app_context) #if OPAL_ENABLE_FT_CR == 1 app_context->sstore_load = NULL; #endif - app_context->max_local_restarts = -1; - app_context->max_global_restarts = -1; - app_context->constrain = true; + app_context->recovery_defined = false; + app_context->max_restarts = -1000; } static void orte_app_context_destructor(orte_app_context_t* app_context) @@ -692,6 +687,7 @@ static void orte_job_construct(orte_job_t* job) OBJ_CONSTRUCT(&job->dyn_spawn_cond, opal_condition_t); job->dyn_spawn_active = false; + job->recovery_defined = false; job->enable_recovery = false; job->launch_msg_sent.tv_sec = 0; @@ -892,7 +888,10 @@ static void orte_proc_construct(orte_proc_t* proc) proc->nodename = NULL; proc->rml_uri = NULL; proc->restarts = 0; - proc->relocates = 0; +#if ORTE_ENABLE_HEARTBEAT + proc->beat = 0; + proc->missed = 0; +#endif #if OPAL_ENABLE_FT_CR == 1 proc->ckpt_state = 0; proc->ckpt_snapshot_ref = NULL; @@ -971,10 +970,6 @@ static void orte_nid_construct(orte_nid_t *ptr) ptr->oversubscribed = false; OBJ_CONSTRUCT(&ptr->attrs, opal_list_t); OBJ_CONSTRUCT(&ptr->sysinfo, opal_list_t); -#if ORTE_ENABLE_HEARTBEAT - ptr->beat = 0; - ptr->missed = 0; -#endif } static void orte_nid_destruct(orte_nid_t *ptr) diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index 543e530f43..50ad26847e 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -227,15 +227,10 @@ typedef struct { /** What files SStore should load before local launch, if any */ char *sstore_load; #endif - /* max number of times a process can be restarted locally */ - int32_t max_local_restarts; - /* max number of times a process can be relocated to another node */ - int32_t max_global_restarts; - /* whether or not the procs in this app are constrained to stay - * on the specified nodes when restarted, or can move to any - * known node - */ - bool constrain; + /* recovery policy has been defined */ + bool recovery_defined; + /* max number of times a process can be restarted */ + int32_t max_restarts; } orte_app_context_t; ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_app_context_t); @@ -423,6 +418,8 @@ typedef struct { bool abort; /* proc that caused that to happen */ struct orte_proc_t *aborted_proc; + /* recovery policy has been defined */ + bool recovery_defined; /* enable recovery of these processes */ bool enable_recovery; /* time launch message was sent */ @@ -485,8 +482,12 @@ struct orte_proc_t { char *rml_uri; /* number of times this process has been restarted */ int32_t restarts; - /* number of times this process has been relocated */ - int32_t relocates; +#if ORTE_ENABLE_HEARTBEAT + /* time when last heartbeat was detected */ + double beat; + /* number of missed heartbeats */ + int missed; +#endif #if OPAL_ENABLE_FT_CR == 1 /* ckpt state */ size_t ckpt_state; @@ -522,12 +523,6 @@ typedef struct { opal_list_t attrs; /* list of system info */ opal_list_t sysinfo; -#if ORTE_ENABLE_HEARTBEAT - /* seconds when last heartbeat was detected */ - double beat; - /* number of missed heartbeats */ - int missed; -#endif } orte_nid_t; ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_nid_t); @@ -704,8 +699,7 @@ ORTE_DECLSPEC extern bool orte_do_not_barrier; /* process recovery */ ORTE_DECLSPEC extern bool orte_enable_recovery; -ORTE_DECLSPEC extern int32_t orte_max_global_restarts; -ORTE_DECLSPEC extern int32_t orte_max_local_restarts; +ORTE_DECLSPEC extern int32_t orte_max_restarts; /* comm interface */ typedef void (*orte_default_cbfunc_t)(int fd, short event, void *data); @@ -724,9 +718,6 @@ ORTE_DECLSPEC int orte_global_comm(orte_process_name_t *recipient, ORTE_DECLSPEC extern bool orte_report_child_jobs_separately; ORTE_DECLSPEC extern struct timeval orte_child_time_to_exit; -/* orte progress threads */ -ORTE_DECLSPEC extern bool orte_progress_threads_enabled; - #endif /* ORTE_DISABLE_FULL_SUPPORT */ END_C_DECLS diff --git a/orte/runtime/orte_mca_params.c b/orte/runtime/orte_mca_params.c index 39208c258e..a6e88ad27a 100644 --- a/orte/runtime/orte_mca_params.c +++ b/orte/runtime/orte_mca_params.c @@ -458,46 +458,38 @@ int orte_register_params(void) (int)false, &value); orte_enable_recovery = OPAL_INT_TO_BOOL(value); - mca_base_param_reg_int_name("orte", "max_global_restarts", - "Max number of times to relocate a failed process to a new node", + mca_base_param_reg_int_name("orte", "max_restarts", + "Max number of times to restart a failed process", false, false, - -1, &orte_max_global_restarts); + -1, &orte_max_restarts); - mca_base_param_reg_int_name("orte", "max_local_restarts", - "Max number of times to locally restart a failed process before relocating it to a new node", - false, false, - -1, &orte_max_local_restarts); if (orte_enable_recovery) { - if (orte_max_global_restarts <= 0 && - orte_max_local_restarts <= 0) { + if (orte_max_restarts <= 0) { if (ORTE_PROC_IS_HNP) { opal_output(orte_clean_output, "------------------------------------------------------------\n" "Although the MCA param orte_enable_recovery was set to true,\n" - "values for the max number of restarts was not provided:\n\n" - "Max global restarts: %d\n" - "Max local restarts: %d\n\n" - "At least one of these must be a positive value. We are disabling\n" + "a value for the max number of restarts was not provided:\n\n" + "Max restarts: %d\n" + "This must be a positive value. We are disabling\n" "process recovery, but continuing execution.\n" "------------------------------------------------------------", - orte_max_global_restarts, orte_max_local_restarts); + orte_max_restarts); } orte_enable_recovery = false; } - } else if (orte_max_global_restarts > 0 || - orte_max_local_restarts > 0) { + } else if (orte_max_restarts > 0) { if (ORTE_PROC_IS_HNP) { opal_output(orte_clean_output, "------------------------------------------------------------------\n" "The MCA param orte_enable_recovery was not set to true, but\n" - "positive value(s) were provided for the number of restarts:\n\n" - "Max global restarts: %d\n" - "Max local restarts: %d\n\n" + "a positive value was provided for the number of restarts:\n\n" + "Max restarts: %d\n" "We are enabling process recovery and continuing execution. To avoid\n" "this warning in the future, please set the orte_enable_recovery\n" "param to non-zero.\n" "------------------------------------------------------------------", - orte_max_global_restarts, orte_max_local_restarts); + orte_max_restarts); } orte_enable_recovery = true; } @@ -514,17 +506,6 @@ int orte_register_params(void) INT_MAX, &value); orte_child_time_to_exit.tv_sec = value; orte_child_time_to_exit.tv_usec = 0; - - mca_base_param_reg_int_name("orte", "enable_progress_threads", - "Enable the use of ORTE progress threads in applications", - false, false, - (int)false, &value); - - if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) { - orte_progress_threads_enabled = true; - } else { - orte_progress_threads_enabled = OPAL_INT_TO_BOOL(value); - } #endif /* ORTE_DISABLE_FULL_SUPPORT */ diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index ae6e34a7a9..5f51913c97 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -425,13 +425,9 @@ static opal_cmd_line_init_t cmd_line_init[] = { NULL, OPAL_CMD_LINE_TYPE_BOOL, "Enable recovery from process failure [Default = disabled]" }, - { "orte", "max", "global_restarts", '\0', "max-global-restarts", "max-global-restarts", 1, + { "orte", "max", "restarts", '\0', "max-restarts", "max-restarts", 1, NULL, OPAL_CMD_LINE_TYPE_INT, - "Max number of times to relocate a failed process to a new node" }, - - { "orte", "max", "local_restarts", '\0', "max-local-restarts", "max-local-restarts", 1, - NULL, OPAL_CMD_LINE_TYPE_INT, - "Max number of times to locally restart a failed process before relocating it to a new node" }, + "Max number of times to restart a failed process" }, #if OPAL_ENABLE_CRDEBUG == 1 { "opal", "cr", "enable_crdebug", '\0', "crdebug", "crdebug", 0, @@ -955,8 +951,7 @@ static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line) /* if recovery was disabled on the cmd line, do so */ if (orterun_globals.disable_recovery) { orte_enable_recovery = false; - orte_max_local_restarts = 0; - orte_max_global_restarts = 0; + orte_max_restarts = 0; } return ORTE_SUCCESS;