1
1

Let the max restart limits be associated with an app_context instead of a job so that individual apps can have different values. Default to a single job-level value.

This commit was SVN r23248.
Этот коммит содержится в:
Ralph Castain 2010-06-07 14:21:08 +00:00
родитель 799a77a187
Коммит e52a54183f
11 изменённых файлов: 113 добавлений и 92 удаления

Просмотреть файл

@ -120,6 +120,7 @@ static int update_state(orte_jobid_t job,
orte_exit_code_t sts;
orte_odls_child_t *child;
int rc;
orte_app_context_t *app;
/* indicate that this is the end of the line */
*stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE;
@ -297,7 +298,8 @@ static int update_state(orte_jobid_t job,
/* is this a local proc */
if (NULL != (child = proc_is_local(proc))) {
/* local proc - see if it has reached its local restart limit */
if (child->restarts < jdata->max_local_restarts) {
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, child->app_idx);
if (child->restarts < app->max_local_restarts) {
child->restarts++;
if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) {
return ORTE_SUCCESS;
@ -1074,7 +1076,8 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc)
/* track that we are attempting to relocate */
pdata->relocates++;
/* have we exceeded the number of relocates for this proc? */
if (jdata->max_global_restarts < pdata->relocates) {
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pdata->app_idx);
if (app->max_global_restarts < pdata->relocates) {
return ORTE_ERR_RELOCATE_LIMIT_EXCEEDED;
}

Просмотреть файл

@ -119,7 +119,8 @@ static int update_state(orte_jobid_t job,
orte_plm_cmd_flag_t cmd;
int rc=ORTE_SUCCESS;
orte_vpid_t null=ORTE_VPID_INVALID;
orte_app_context_t *app;
/* indicate that this is the end of the line */
*stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE;
@ -292,7 +293,8 @@ static int update_state(orte_jobid_t job,
/* kill this proc */
killprocs(proc->jobid, proc->vpid);
}
if (jobdat->enable_recovery && child->restarts < jobdat->max_local_restarts) {
app = jobdat->apps[child->app_idx];
if (jobdat->enable_recovery && child->restarts < app->max_local_restarts) {
child->restarts++;
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:orted restarting proc %s for the %d time",
@ -315,7 +317,8 @@ static int update_state(orte_jobid_t job,
if (child->name->jobid == proc->jobid &&
child->name->vpid == proc->vpid) {
/* see if this child has reached its local restart limit */
if (child->restarts == jobdat->max_local_restarts ) {
app = jobdat->apps[child->app_idx];
if (child->restarts == app->max_local_restarts ) {
goto REPORT_ABORT;
}
/* otherwise, attempt to restart it locally */

Просмотреть файл

@ -386,12 +386,6 @@ pack_add_procs:
return rc;
}
/* pack the max number of local restarts allowed for this job */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->max_local_restarts, 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the number of app_contexts for this job */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->num_apps, 1, ORTE_APP_IDX))) {
ORTE_ERROR_LOG(rc);
@ -850,12 +844,6 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
/* unpack the max number of local restarts allowed for this job */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->max_local_restarts, &cnt, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
/* unpack the number of app_contexts for this job */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->num_apps, &cnt, ORTE_APP_IDX))) {
@ -2247,7 +2235,8 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc,
flag = 1;
opal_dss.pack(&buffer, &flag, 1, OPAL_INT8);
opal_dss.pack(&buffer, &jobdat->regexp, 1, OPAL_STRING);
} else {
} else if (NULL != orte_odls_globals.dmap &&
NULL != jobdat->pmap) {
/* the data is in the local byte objects - send them */
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:sync sending byte object",

Просмотреть файл

@ -314,7 +314,6 @@ static void orte_odls_job_constructor(orte_odls_job_t *ptr)
ptr->num_participating = -1;
ptr->num_collected = 0;
ptr->enable_recovery = false;
ptr->max_local_restarts = 0;
}
static void orte_odls_job_destructor(orte_odls_job_t *ptr)
{

Просмотреть файл

@ -144,7 +144,6 @@ typedef struct orte_odls_job_t {
int num_collected;
struct timeval launch_msg_recvd; /* when the launch msg for this job was recvd - for timing purposes only */
bool enable_recovery; /* enable recovery of failed processes */
int32_t max_local_restarts; /* max number of times a local proc can be restarted */
} orte_odls_job_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_odls_job_t);

Просмотреть файл

@ -74,6 +74,7 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
orte_app_context_t *app;
int rc, tmp;
int32_t ljob;
orte_app_idx_t i;
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:setup_job for job %s",
@ -93,35 +94,34 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
opal_pointer_array_set_item(orte_job_data, ljob, jdata);
/* see if recovery was set in the app */
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0))) {
/* big problem! */
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
if (ORTE_SUCCESS == mca_base_param_find_int_name("orte", "enable_recovery", app->env, &tmp)) {
jdata->enable_recovery = OPAL_INT_TO_BOOL(tmp);
} else {
jdata->enable_recovery = orte_enable_recovery;
}
if (ORTE_SUCCESS == mca_base_param_find_int_name("orte", "max_global_restarts", app->env, &tmp)) {
jdata->max_global_restarts = tmp;
} else {
jdata->max_global_restarts = orte_max_global_restarts;
}
if (ORTE_SUCCESS == mca_base_param_find_int_name("orte", "max_local_restarts", app->env, &tmp)) {
jdata->max_local_restarts = tmp;
} else {
jdata->max_local_restarts = orte_max_local_restarts;
}
/* consistency check */
if (jdata->max_global_restarts <= 0 &&
jdata->max_local_restarts <= 0) {
jdata->enable_recovery = false;
} else {
jdata->enable_recovery = true;
for (i=0; i < jdata->num_apps; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
/* big problem! */
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
if (ORTE_SUCCESS == mca_base_param_find_int_name("orte", "enable_recovery", app->env, &tmp)) {
jdata->enable_recovery = OPAL_INT_TO_BOOL(tmp);
} else {
jdata->enable_recovery = orte_enable_recovery;
}
if (ORTE_SUCCESS == mca_base_param_find_int_name("orte", "max_global_restarts", app->env, &tmp)) {
app->max_global_restarts = tmp;
} else {
app->max_global_restarts = orte_max_global_restarts;
}
if (ORTE_SUCCESS == mca_base_param_find_int_name("orte", "max_local_restarts", app->env, &tmp)) {
app->max_local_restarts = tmp;
} else {
app->max_local_restarts = orte_max_local_restarts;
}
/* consistency check */
if (app->max_global_restarts > 0 ||
app->max_local_restarts > 0) {
jdata->enable_recovery = true;
}
}
}

Просмотреть файл

@ -306,20 +306,6 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src,
return rc;
}
/* pack the max local restarts */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(jobs[i]->max_local_restarts)), 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the max global restarts */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(jobs[i]->max_global_restarts)), 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
#if OPAL_ENABLE_FT_CR == 1
/* pack the ckpt state */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
@ -555,6 +541,13 @@ int orte_dt_pack_app_context(opal_buffer_t *buffer, const void *src,
app_context = (orte_app_context_t**) src;
for (i=0; i < num_vals; i++) {
/* pack the user's name for this app */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(app_context[i]->name)), 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the application index (for multiapp jobs) */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(app_context[i]->idx)), 1, ORTE_STD_CNTR))) {
@ -770,6 +763,19 @@ int orte_dt_pack_app_context(opal_buffer_t *buffer, const void *src,
return rc;
}
}
/* pack the restart limits */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(app_context[i]->max_local_restarts)), 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(app_context[i]->max_global_restarts)), 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
return ORTE_SUCCESS;

Просмотреть файл

@ -216,8 +216,8 @@ int orte_dt_print_job(char **output, char *prefix, orte_job_t *src, opal_data_ty
asprintf(&pfx2, "%s", prefix);
}
asprintf(&tmp, "\n%sData for job: %s\tNum apps: %ld\tControls: %0x\tStdin target: %s\tState: %s\tAbort: %s", pfx2,
ORTE_JOBID_PRINT(src->jobid),
asprintf(&tmp, "\n%sData for job: %s\tName: %s\n%s\tNum apps: %ld\tControls: %0x\tStdin target: %s\tState: %s\tAbort: %s", pfx2,
ORTE_JOBID_PRINT(src->jobid), src->name, pfx2,
(long)src->num_apps, src->controls, ORTE_VPID_PRINT(src->stdin_target),
orte_job_state_to_str(src->state), src->abort ? "True" : "False");
asprintf(&pfx, "%s\t", pfx2);
@ -249,8 +249,7 @@ int orte_dt_print_job(char **output, char *prefix, orte_job_t *src, opal_data_ty
tmp = tmp2;
}
asprintf(&tmp2, "%s\n%sNum procs: %ld\tMax Local Restarts: %d\tMax Global Restarts %d", tmp, pfx,
(long)src->num_procs, src->max_local_restarts, src->max_global_restarts);
asprintf(&tmp2, "%s\n%sNum procs: %ld", tmp, pfx, (long)src->num_procs);
free(tmp);
tmp = tmp2;
@ -525,9 +524,9 @@ int orte_dt_print_app_context(char **output, char *prefix, orte_app_context_t *s
asprintf(&pfx2, "%s", prefix);
}
asprintf(&tmp, "\n%sData for app_context: index %lu\tapp: %s\n%s\tNum procs: %lu",
pfx2, (unsigned long)src->idx, src->app,
pfx2, (unsigned long)src->num_procs);
asprintf(&tmp, "\n%sData for app_context: name: %s\t index %lu\tapp: %s\n%s\tNum procs: %lu\tMax Local Restarts: %d\tMax Global Restarts %d",
pfx2, src->name, (unsigned long)src->idx, src->app,
pfx2, (unsigned long)src->num_procs, src->max_local_restarts, src->max_global_restarts);
count = opal_argv_count(src->argv);
for (i=0; i < count; i++) {

Просмотреть файл

@ -308,22 +308,6 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest,
return rc;
}
/* unpack the max local restarts */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
(&(jobs[i]->max_local_restarts)), &n, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the max global restarts */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
(&(jobs[i]->max_global_restarts)), &n, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
#if OPAL_ENABLE_FT_CR == 1
/* unpack the ckpt state */
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
@ -596,6 +580,14 @@ int orte_dt_unpack_app_context(opal_buffer_t *buffer, void *dest,
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* get the name */
max_n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &(app_context[i]->name),
&max_n, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* get the app index number */
max_n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &(app_context[i]->idx),
@ -836,6 +828,20 @@ int orte_dt_unpack_app_context(opal_buffer_t *buffer, void *dest,
app_context[i]->preload_files_src_dir = NULL;
}
/* unpack the restart limits */
max_n=1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &app_context[i]->max_local_restarts,
&max_n, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
max_n=1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &app_context[i]->max_global_restarts,
&max_n, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
return ORTE_SUCCESS;

Просмотреть файл

@ -517,6 +517,7 @@ int orte_global_comm(orte_process_name_t *recipient,
static void orte_app_context_construct(orte_app_context_t* app_context)
{
app_context->name = NULL;
app_context->idx=0;
app_context->app=NULL;
app_context->num_procs=0;
@ -534,10 +535,16 @@ static void orte_app_context_construct(orte_app_context_t* app_context)
app_context->preload_files_dest_dir = NULL;
app_context->preload_files_src_dir = NULL;
app_context->used_on_node = false;
app_context->max_local_restarts = -1;
app_context->max_global_restarts = -1;
}
static void orte_app_context_destructor(orte_app_context_t* app_context)
{
if (NULL != app_context->name) {
free(app_context->name);
}
if (NULL != app_context->app) {
free (app_context->app);
app_context->app = NULL;
@ -610,6 +617,7 @@ OBJ_CLASS_INSTANCE(orte_app_context_t,
static void orte_job_construct(orte_job_t* job)
{
job->name = NULL;
job->jobid = ORTE_JOBID_INVALID;
job->apps = OBJ_NEW(opal_pointer_array_t);
opal_pointer_array_init(job->apps,
@ -644,8 +652,6 @@ static void orte_job_construct(orte_job_t* job)
job->not_reported = true;
job->enable_recovery = false;
job->max_local_restarts = -1;
job->max_global_restarts = -1;
job->launch_msg_sent.tv_sec = 0;
job->launch_msg_sent.tv_usec = 0;

Просмотреть файл

@ -170,6 +170,11 @@ struct orte_job_map_t;
typedef struct {
/** Parent object */
opal_object_t super;
/** unique name for this application - has
* nothing to do with argv[0], but has meaning
* to the user, if provided
*/
char *name;
/** Unique index when multiple apps per job */
orte_app_idx_t idx;
/** Absolute pathname of argv[0] */
@ -208,6 +213,10 @@ typedef struct {
char *preload_files_src_dir;
/* is being used on the local node */
bool used_on_node;
/* max number of times a process can be restarted locally */
int32_t max_local_restarts;
/* max number of times a process can be relocated to another node */
int32_t max_global_restarts;
} orte_app_context_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_app_context_t);
@ -341,6 +350,8 @@ typedef uint8_t orte_job_controls_t;
typedef struct {
/** Base object so this can be put on a list */
opal_list_item_t super;
/* a name for this job */
char *name;
/* jobid for this job */
orte_jobid_t jobid;
/* app_context array for this job */
@ -392,14 +403,14 @@ typedef struct {
struct orte_proc_t *aborted_proc;
/* enable recovery of these processes */
bool enable_recovery;
/* max number of times a process can be restarted locally */
int32_t max_local_restarts;
/* max number of times a process can be relocated to another node */
int32_t max_global_restarts;
/* time launch message was sent */
struct timeval launch_msg_sent;
/* max time for launch msg to be received */
struct timeval max_launch_msg_recvd;
/* uid under which to run the job */
int32_t uid;
/* gid under which to run the job */
int32_t gid;
#if OPAL_ENABLE_FT_CR == 1
/* ckpt state */
size_t ckpt_state;