Let max restarts be associated with an app_context instead of a job so that individual apps can have different values. Default to a single job-level value.
This commit was SVN r23248.
Этот коммит содержится в:
родитель
799a77a187
Коммит
e52a54183f
@ -120,6 +120,7 @@ static int update_state(orte_jobid_t job,
|
||||
orte_exit_code_t sts;
|
||||
orte_odls_child_t *child;
|
||||
int rc;
|
||||
orte_app_context_t *app;
|
||||
|
||||
/* indicate that this is the end of the line */
|
||||
*stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE;
|
||||
@ -297,7 +298,8 @@ static int update_state(orte_jobid_t job,
|
||||
/* is this a local proc */
|
||||
if (NULL != (child = proc_is_local(proc))) {
|
||||
/* local proc - see if it has reached its local restart limit */
|
||||
if (child->restarts < jdata->max_local_restarts) {
|
||||
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, child->app_idx);
|
||||
if (child->restarts < app->max_local_restarts) {
|
||||
child->restarts++;
|
||||
if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) {
|
||||
return ORTE_SUCCESS;
|
||||
@ -1074,7 +1076,8 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc)
|
||||
/* track that we are attempting to relocate */
|
||||
pdata->relocates++;
|
||||
/* have we exceeded the number of relocates for this proc? */
|
||||
if (jdata->max_global_restarts < pdata->relocates) {
|
||||
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pdata->app_idx);
|
||||
if (app->max_global_restarts < pdata->relocates) {
|
||||
return ORTE_ERR_RELOCATE_LIMIT_EXCEEDED;
|
||||
}
|
||||
|
||||
|
@ -119,7 +119,8 @@ static int update_state(orte_jobid_t job,
|
||||
orte_plm_cmd_flag_t cmd;
|
||||
int rc=ORTE_SUCCESS;
|
||||
orte_vpid_t null=ORTE_VPID_INVALID;
|
||||
|
||||
orte_app_context_t *app;
|
||||
|
||||
/* indicate that this is the end of the line */
|
||||
*stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE;
|
||||
|
||||
@ -292,7 +293,8 @@ static int update_state(orte_jobid_t job,
|
||||
/* kill this proc */
|
||||
killprocs(proc->jobid, proc->vpid);
|
||||
}
|
||||
if (jobdat->enable_recovery && child->restarts < jobdat->max_local_restarts) {
|
||||
app = jobdat->apps[child->app_idx];
|
||||
if (jobdat->enable_recovery && child->restarts < app->max_local_restarts) {
|
||||
child->restarts++;
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr:orted restarting proc %s for the %d time",
|
||||
@ -315,7 +317,8 @@ static int update_state(orte_jobid_t job,
|
||||
if (child->name->jobid == proc->jobid &&
|
||||
child->name->vpid == proc->vpid) {
|
||||
/* see if this child has reached its local restart limit */
|
||||
if (child->restarts == jobdat->max_local_restarts ) {
|
||||
app = jobdat->apps[child->app_idx];
|
||||
if (child->restarts == app->max_local_restarts ) {
|
||||
goto REPORT_ABORT;
|
||||
}
|
||||
/* otherwise, attempt to restart it locally */
|
||||
|
@ -386,12 +386,6 @@ pack_add_procs:
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the max number of local restarts allowed for this job */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->max_local_restarts, 1, ORTE_VPID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the number of app_contexts for this job */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->num_apps, 1, ORTE_APP_IDX))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -850,12 +844,6 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto REPORT_ERROR;
|
||||
}
|
||||
/* unpack the max number of local restarts allowed for this job */
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->max_local_restarts, &cnt, ORTE_VPID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto REPORT_ERROR;
|
||||
}
|
||||
/* unpack the number of app_contexts for this job */
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->num_apps, &cnt, ORTE_APP_IDX))) {
|
||||
@ -2247,7 +2235,8 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc,
|
||||
flag = 1;
|
||||
opal_dss.pack(&buffer, &flag, 1, OPAL_INT8);
|
||||
opal_dss.pack(&buffer, &jobdat->regexp, 1, OPAL_STRING);
|
||||
} else {
|
||||
} else if (NULL != orte_odls_globals.dmap &&
|
||||
NULL != jobdat->pmap) {
|
||||
/* the data is in the local byte objects - send them */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:sync sending byte object",
|
||||
|
@ -314,7 +314,6 @@ static void orte_odls_job_constructor(orte_odls_job_t *ptr)
|
||||
ptr->num_participating = -1;
|
||||
ptr->num_collected = 0;
|
||||
ptr->enable_recovery = false;
|
||||
ptr->max_local_restarts = 0;
|
||||
}
|
||||
static void orte_odls_job_destructor(orte_odls_job_t *ptr)
|
||||
{
|
||||
|
@ -144,7 +144,6 @@ typedef struct orte_odls_job_t {
|
||||
int num_collected;
|
||||
struct timeval launch_msg_recvd; /* when the launch msg for this job was recvd - for timing purposes only */
|
||||
bool enable_recovery; /* enable recovery of failed processes */
|
||||
int32_t max_local_restarts; /* max number of times a local proc can be restarted */
|
||||
} orte_odls_job_t;
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_odls_job_t);
|
||||
|
||||
|
@ -74,6 +74,7 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
|
||||
orte_app_context_t *app;
|
||||
int rc, tmp;
|
||||
int32_t ljob;
|
||||
orte_app_idx_t i;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:setup_job for job %s",
|
||||
@ -93,35 +94,34 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
|
||||
opal_pointer_array_set_item(orte_job_data, ljob, jdata);
|
||||
|
||||
/* see if recovery was set in the app */
|
||||
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0))) {
|
||||
/* big problem! */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
if (ORTE_SUCCESS == mca_base_param_find_int_name("orte", "enable_recovery", app->env, &tmp)) {
|
||||
jdata->enable_recovery = OPAL_INT_TO_BOOL(tmp);
|
||||
} else {
|
||||
jdata->enable_recovery = orte_enable_recovery;
|
||||
}
|
||||
if (ORTE_SUCCESS == mca_base_param_find_int_name("orte", "max_global_restarts", app->env, &tmp)) {
|
||||
jdata->max_global_restarts = tmp;
|
||||
} else {
|
||||
jdata->max_global_restarts = orte_max_global_restarts;
|
||||
}
|
||||
if (ORTE_SUCCESS == mca_base_param_find_int_name("orte", "max_local_restarts", app->env, &tmp)) {
|
||||
jdata->max_local_restarts = tmp;
|
||||
} else {
|
||||
jdata->max_local_restarts = orte_max_local_restarts;
|
||||
|
||||
}
|
||||
|
||||
/* consistency check */
|
||||
if (jdata->max_global_restarts <= 0 &&
|
||||
jdata->max_local_restarts <= 0) {
|
||||
jdata->enable_recovery = false;
|
||||
|
||||
} else {
|
||||
jdata->enable_recovery = true;
|
||||
for (i=0; i < jdata->num_apps; i++) {
|
||||
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
|
||||
/* big problem! */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
if (ORTE_SUCCESS == mca_base_param_find_int_name("orte", "enable_recovery", app->env, &tmp)) {
|
||||
jdata->enable_recovery = OPAL_INT_TO_BOOL(tmp);
|
||||
} else {
|
||||
jdata->enable_recovery = orte_enable_recovery;
|
||||
}
|
||||
if (ORTE_SUCCESS == mca_base_param_find_int_name("orte", "max_global_restarts", app->env, &tmp)) {
|
||||
app->max_global_restarts = tmp;
|
||||
} else {
|
||||
app->max_global_restarts = orte_max_global_restarts;
|
||||
}
|
||||
if (ORTE_SUCCESS == mca_base_param_find_int_name("orte", "max_local_restarts", app->env, &tmp)) {
|
||||
app->max_local_restarts = tmp;
|
||||
} else {
|
||||
app->max_local_restarts = orte_max_local_restarts;
|
||||
|
||||
}
|
||||
/* consistency check */
|
||||
if (app->max_global_restarts > 0 ||
|
||||
app->max_local_restarts > 0) {
|
||||
jdata->enable_recovery = true;
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -306,20 +306,6 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src,
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the max local restarts */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
|
||||
(void*)(&(jobs[i]->max_local_restarts)), 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the max global restarts */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
|
||||
(void*)(&(jobs[i]->max_global_restarts)), 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
/* pack the ckpt state */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
|
||||
@ -555,6 +541,13 @@ int orte_dt_pack_app_context(opal_buffer_t *buffer, const void *src,
|
||||
app_context = (orte_app_context_t**) src;
|
||||
|
||||
for (i=0; i < num_vals; i++) {
|
||||
/* pack the user's name for this app */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
|
||||
(void*)(&(app_context[i]->name)), 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the application index (for multiapp jobs) */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
|
||||
(void*)(&(app_context[i]->idx)), 1, ORTE_STD_CNTR))) {
|
||||
@ -770,6 +763,19 @@ int orte_dt_pack_app_context(opal_buffer_t *buffer, const void *src,
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
||||
/* pack the restart limits */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
|
||||
(void*)(&(app_context[i]->max_local_restarts)), 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
|
||||
(void*)(&(app_context[i]->max_global_restarts)), 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
@ -216,8 +216,8 @@ int orte_dt_print_job(char **output, char *prefix, orte_job_t *src, opal_data_ty
|
||||
asprintf(&pfx2, "%s", prefix);
|
||||
}
|
||||
|
||||
asprintf(&tmp, "\n%sData for job: %s\tNum apps: %ld\tControls: %0x\tStdin target: %s\tState: %s\tAbort: %s", pfx2,
|
||||
ORTE_JOBID_PRINT(src->jobid),
|
||||
asprintf(&tmp, "\n%sData for job: %s\tName: %s\n%s\tNum apps: %ld\tControls: %0x\tStdin target: %s\tState: %s\tAbort: %s", pfx2,
|
||||
ORTE_JOBID_PRINT(src->jobid), src->name, pfx2,
|
||||
(long)src->num_apps, src->controls, ORTE_VPID_PRINT(src->stdin_target),
|
||||
orte_job_state_to_str(src->state), src->abort ? "True" : "False");
|
||||
asprintf(&pfx, "%s\t", pfx2);
|
||||
@ -249,8 +249,7 @@ int orte_dt_print_job(char **output, char *prefix, orte_job_t *src, opal_data_ty
|
||||
tmp = tmp2;
|
||||
}
|
||||
|
||||
asprintf(&tmp2, "%s\n%sNum procs: %ld\tMax Local Restarts: %d\tMax Global Restarts %d", tmp, pfx,
|
||||
(long)src->num_procs, src->max_local_restarts, src->max_global_restarts);
|
||||
asprintf(&tmp2, "%s\n%sNum procs: %ld", tmp, pfx, (long)src->num_procs);
|
||||
free(tmp);
|
||||
tmp = tmp2;
|
||||
|
||||
@ -525,9 +524,9 @@ int orte_dt_print_app_context(char **output, char *prefix, orte_app_context_t *s
|
||||
asprintf(&pfx2, "%s", prefix);
|
||||
}
|
||||
|
||||
asprintf(&tmp, "\n%sData for app_context: index %lu\tapp: %s\n%s\tNum procs: %lu",
|
||||
pfx2, (unsigned long)src->idx, src->app,
|
||||
pfx2, (unsigned long)src->num_procs);
|
||||
asprintf(&tmp, "\n%sData for app_context: name: %s\t index %lu\tapp: %s\n%s\tNum procs: %lu\tMax Local Restarts: %d\tMax Global Restarts %d",
|
||||
pfx2, src->name, (unsigned long)src->idx, src->app,
|
||||
pfx2, (unsigned long)src->num_procs, src->max_local_restarts, src->max_global_restarts);
|
||||
|
||||
count = opal_argv_count(src->argv);
|
||||
for (i=0; i < count; i++) {
|
||||
|
@ -308,22 +308,6 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest,
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* unpack the max local restarts */
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
|
||||
(&(jobs[i]->max_local_restarts)), &n, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* unpack the max global restarts */
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
|
||||
(&(jobs[i]->max_global_restarts)), &n, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
/* unpack the ckpt state */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
|
||||
@ -596,6 +580,14 @@ int orte_dt_unpack_app_context(opal_buffer_t *buffer, void *dest,
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* get the name */
|
||||
max_n = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &(app_context[i]->name),
|
||||
&max_n, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* get the app index number */
|
||||
max_n = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &(app_context[i]->idx),
|
||||
@ -836,6 +828,20 @@ int orte_dt_unpack_app_context(opal_buffer_t *buffer, void *dest,
|
||||
app_context[i]->preload_files_src_dir = NULL;
|
||||
}
|
||||
|
||||
/* unpack the restart limits */
|
||||
max_n=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &app_context[i]->max_local_restarts,
|
||||
&max_n, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
max_n=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &app_context[i]->max_global_restarts,
|
||||
&max_n, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
@ -517,6 +517,7 @@ int orte_global_comm(orte_process_name_t *recipient,
|
||||
|
||||
static void orte_app_context_construct(orte_app_context_t* app_context)
|
||||
{
|
||||
app_context->name = NULL;
|
||||
app_context->idx=0;
|
||||
app_context->app=NULL;
|
||||
app_context->num_procs=0;
|
||||
@ -534,10 +535,16 @@ static void orte_app_context_construct(orte_app_context_t* app_context)
|
||||
app_context->preload_files_dest_dir = NULL;
|
||||
app_context->preload_files_src_dir = NULL;
|
||||
app_context->used_on_node = false;
|
||||
app_context->max_local_restarts = -1;
|
||||
app_context->max_global_restarts = -1;
|
||||
}
|
||||
|
||||
static void orte_app_context_destructor(orte_app_context_t* app_context)
|
||||
{
|
||||
if (NULL != app_context->name) {
|
||||
free(app_context->name);
|
||||
}
|
||||
|
||||
if (NULL != app_context->app) {
|
||||
free (app_context->app);
|
||||
app_context->app = NULL;
|
||||
@ -610,6 +617,7 @@ OBJ_CLASS_INSTANCE(orte_app_context_t,
|
||||
|
||||
static void orte_job_construct(orte_job_t* job)
|
||||
{
|
||||
job->name = NULL;
|
||||
job->jobid = ORTE_JOBID_INVALID;
|
||||
job->apps = OBJ_NEW(opal_pointer_array_t);
|
||||
opal_pointer_array_init(job->apps,
|
||||
@ -644,8 +652,6 @@ static void orte_job_construct(orte_job_t* job)
|
||||
job->not_reported = true;
|
||||
|
||||
job->enable_recovery = false;
|
||||
job->max_local_restarts = -1;
|
||||
job->max_global_restarts = -1;
|
||||
|
||||
job->launch_msg_sent.tv_sec = 0;
|
||||
job->launch_msg_sent.tv_usec = 0;
|
||||
|
@ -170,6 +170,11 @@ struct orte_job_map_t;
|
||||
typedef struct {
|
||||
/** Parent object */
|
||||
opal_object_t super;
|
||||
/** unique name for this application - has
|
||||
* nothing to do with argv[0], but has meaning
|
||||
* to the user, if provided
|
||||
*/
|
||||
char *name;
|
||||
/** Unique index when multiple apps per job */
|
||||
orte_app_idx_t idx;
|
||||
/** Absolute pathname of argv[0] */
|
||||
@ -208,6 +213,10 @@ typedef struct {
|
||||
char *preload_files_src_dir;
|
||||
/* is being used on the local node */
|
||||
bool used_on_node;
|
||||
/* max number of times a process can be restarted locally */
|
||||
int32_t max_local_restarts;
|
||||
/* max number of times a process can be relocated to another node */
|
||||
int32_t max_global_restarts;
|
||||
} orte_app_context_t;
|
||||
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_app_context_t);
|
||||
@ -341,6 +350,8 @@ typedef uint8_t orte_job_controls_t;
|
||||
typedef struct {
|
||||
/** Base object so this can be put on a list */
|
||||
opal_list_item_t super;
|
||||
/* a name for this job */
|
||||
char *name;
|
||||
/* jobid for this job */
|
||||
orte_jobid_t jobid;
|
||||
/* app_context array for this job */
|
||||
@ -392,14 +403,14 @@ typedef struct {
|
||||
struct orte_proc_t *aborted_proc;
|
||||
/* enable recovery of these processes */
|
||||
bool enable_recovery;
|
||||
/* max number of times a process can be restarted locally */
|
||||
int32_t max_local_restarts;
|
||||
/* max number of times a process can be relocated to another node */
|
||||
int32_t max_global_restarts;
|
||||
/* time launch message was sent */
|
||||
struct timeval launch_msg_sent;
|
||||
/* max time for launch msg to be received */
|
||||
struct timeval max_launch_msg_recvd;
|
||||
/* uid under which to run the job */
|
||||
int32_t uid;
|
||||
/* gid under which to run the job */
|
||||
int32_t gid;
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
/* ckpt state */
|
||||
size_t ckpt_state;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user