From e52a54183fce06b201cf2bb1b529bc43a05d7edc Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 7 Jun 2010 14:21:08 +0000 Subject: [PATCH] Let max restarts be associated with an app_context instead of a job so that individual apps can have different values. Default to a single job-level value This commit was SVN r23248. --- orte/mca/errmgr/hnp/errmgr_hnp.c | 7 ++- orte/mca/errmgr/orted/errmgr_orted.c | 9 ++- orte/mca/odls/base/odls_base_default_fns.c | 15 +---- orte/mca/odls/base/odls_base_open.c | 1 - orte/mca/odls/odls_types.h | 1 - orte/mca/plm/base/plm_base_launch_support.c | 58 +++++++++---------- .../data_type_support/orte_dt_packing_fns.c | 34 ++++++----- .../data_type_support/orte_dt_print_fns.c | 13 ++--- .../data_type_support/orte_dt_unpacking_fns.c | 38 +++++++----- orte/runtime/orte_globals.c | 10 +++- orte/runtime/orte_globals.h | 19 ++++-- 11 files changed, 113 insertions(+), 92 deletions(-) diff --git a/orte/mca/errmgr/hnp/errmgr_hnp.c b/orte/mca/errmgr/hnp/errmgr_hnp.c index ed0007603f..d01bbd7f8b 100644 --- a/orte/mca/errmgr/hnp/errmgr_hnp.c +++ b/orte/mca/errmgr/hnp/errmgr_hnp.c @@ -120,6 +120,7 @@ static int update_state(orte_jobid_t job, orte_exit_code_t sts; orte_odls_child_t *child; int rc; + orte_app_context_t *app; /* indicate that this is the end of the line */ *stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE; @@ -297,7 +298,8 @@ static int update_state(orte_jobid_t job, /* is this a local proc */ if (NULL != (child = proc_is_local(proc))) { /* local proc - see if it has reached its local restart limit */ - if (child->restarts < jdata->max_local_restarts) { + app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, child->app_idx); + if (child->restarts < app->max_local_restarts) { child->restarts++; if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) { return ORTE_SUCCESS; @@ -1074,7 +1076,8 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc) /* track that we are attempting to relocate */ pdata->relocates++; /* have we exceeded the number of relocates for this proc? */ - if (jdata->max_global_restarts < pdata->relocates) { + app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pdata->app_idx); + if (app->max_global_restarts < pdata->relocates) { return ORTE_ERR_RELOCATE_LIMIT_EXCEEDED; } diff --git a/orte/mca/errmgr/orted/errmgr_orted.c b/orte/mca/errmgr/orted/errmgr_orted.c index 0a352bc8b6..551cd2bb33 100644 --- a/orte/mca/errmgr/orted/errmgr_orted.c +++ b/orte/mca/errmgr/orted/errmgr_orted.c @@ -119,7 +119,8 @@ static int update_state(orte_jobid_t job, orte_plm_cmd_flag_t cmd; int rc=ORTE_SUCCESS; orte_vpid_t null=ORTE_VPID_INVALID; - + orte_app_context_t *app; + /* indicate that this is the end of the line */ *stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE; @@ -292,7 +293,8 @@ static int update_state(orte_jobid_t job, /* kill this proc */ killprocs(proc->jobid, proc->vpid); } - if (jobdat->enable_recovery && child->restarts < jobdat->max_local_restarts) { + app = jobdat->apps[child->app_idx]; + if (jobdat->enable_recovery && child->restarts < app->max_local_restarts) { child->restarts++; OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:orted restarting proc %s for the %d time", @@ -315,7 +317,8 @@ static int update_state(orte_jobid_t job, if (child->name->jobid == proc->jobid && child->name->vpid == proc->vpid) { /* see if this child has reached its local restart limit */ - if (child->restarts == jobdat->max_local_restarts ) { + app = jobdat->apps[child->app_idx]; + if (child->restarts == app->max_local_restarts ) { goto REPORT_ABORT; } /* otherwise, attempt to restart it locally */ diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index 4caf4b7455..92ca38862e 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -386,12 +386,6 @@ pack_add_procs: return rc; } - /* pack the max number of local restarts allowed for this job */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->max_local_restarts, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* pack the number of app_contexts for this job */ if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->num_apps, 1, ORTE_APP_IDX))) { ORTE_ERROR_LOG(rc); @@ -850,12 +844,6 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } - /* unpack the max number of local restarts allowed for this job */ - cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->max_local_restarts, &cnt, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - goto REPORT_ERROR; - } /* unpack the number of app_contexts for this job */ cnt=1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->num_apps, &cnt, ORTE_APP_IDX))) { @@ -2247,7 +2235,8 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc, flag = 1; opal_dss.pack(&buffer, &flag, 1, OPAL_INT8); opal_dss.pack(&buffer, &jobdat->regexp, 1, OPAL_STRING); - } else { + } else if (NULL != orte_odls_globals.dmap && + NULL != jobdat->pmap) { /* the data is in the local byte objects - send them */ OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:sync sending byte object", diff --git a/orte/mca/odls/base/odls_base_open.c b/orte/mca/odls/base/odls_base_open.c index 33099c45a5..2447f158ff 100644 --- a/orte/mca/odls/base/odls_base_open.c +++ b/orte/mca/odls/base/odls_base_open.c @@ -314,7 +314,6 @@ static void orte_odls_job_constructor(orte_odls_job_t *ptr) ptr->num_participating = -1; ptr->num_collected = 0; ptr->enable_recovery = false; - ptr->max_local_restarts = 0; } static void orte_odls_job_destructor(orte_odls_job_t *ptr) { diff --git a/orte/mca/odls/odls_types.h b/orte/mca/odls/odls_types.h index 46325268b1..e3162f94fe 100644 --- a/orte/mca/odls/odls_types.h +++ b/orte/mca/odls/odls_types.h @@ -144,7 +144,6 @@ typedef struct orte_odls_job_t { int num_collected; struct timeval launch_msg_recvd; /* when the launch msg for this job was recvd - for timing purposes only */ bool enable_recovery; /* enable recovery of failed processes */ - int32_t max_local_restarts; /* max number of times a local proc can be restarted */ } orte_odls_job_t; ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_odls_job_t); diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index 5043ca45ff..214ad5debd 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -74,6 +74,7 @@ int orte_plm_base_setup_job(orte_job_t *jdata) orte_app_context_t *app; int rc, tmp; int32_t ljob; + orte_app_idx_t i; OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, "%s plm:base:setup_job for job %s", @@ -93,35 +94,34 @@ int orte_plm_base_setup_job(orte_job_t *jdata) opal_pointer_array_set_item(orte_job_data, ljob, jdata); /* see if recovery was set in the app */ - if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0))) { - /* big problem! */ - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - if (ORTE_SUCCESS == mca_base_param_find_int_name("orte", "enable_recovery", app->env, &tmp)) { - jdata->enable_recovery = OPAL_INT_TO_BOOL(tmp); - } else { - jdata->enable_recovery = orte_enable_recovery; - } - if (ORTE_SUCCESS == mca_base_param_find_int_name("orte", "max_global_restarts", app->env, &tmp)) { - jdata->max_global_restarts = tmp; - } else { - jdata->max_global_restarts = orte_max_global_restarts; - } - if (ORTE_SUCCESS == mca_base_param_find_int_name("orte", "max_local_restarts", app->env, &tmp)) { - jdata->max_local_restarts = tmp; - } else { - jdata->max_local_restarts = orte_max_local_restarts; - - } - - /* consistency check */ - if (jdata->max_global_restarts <= 0 && - jdata->max_local_restarts <= 0) { - jdata->enable_recovery = false; - - } else { - jdata->enable_recovery = true; + for (i=0; i < jdata->num_apps; i++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { + /* big problem! */ + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + if (ORTE_SUCCESS == mca_base_param_find_int_name("orte", "enable_recovery", app->env, &tmp)) { + jdata->enable_recovery = OPAL_INT_TO_BOOL(tmp); + } else { + jdata->enable_recovery = orte_enable_recovery; + } + if (ORTE_SUCCESS == mca_base_param_find_int_name("orte", "max_global_restarts", app->env, &tmp)) { + app->max_global_restarts = tmp; + } else { + app->max_global_restarts = orte_max_global_restarts; + } + if (ORTE_SUCCESS == mca_base_param_find_int_name("orte", "max_local_restarts", app->env, &tmp)) { + app->max_local_restarts = tmp; + } else { + app->max_local_restarts = orte_max_local_restarts; + + } + /* consistency check */ + if (app->max_global_restarts > 0 || + app->max_local_restarts > 0) { + jdata->enable_recovery = true; + + } } } diff --git a/orte/runtime/data_type_support/orte_dt_packing_fns.c b/orte/runtime/data_type_support/orte_dt_packing_fns.c index 1c6007dc0f..c907ff0795 100644 --- a/orte/runtime/data_type_support/orte_dt_packing_fns.c +++ b/orte/runtime/data_type_support/orte_dt_packing_fns.c @@ -306,20 +306,6 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src, return rc; } - /* pack the max local restarts */ - if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, - (void*)(&(jobs[i]->max_local_restarts)), 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* pack the max global restarts */ - if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, - (void*)(&(jobs[i]->max_global_restarts)), 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return rc; - } - #if OPAL_ENABLE_FT_CR == 1 /* pack the ckpt state */ if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, @@ -555,6 +541,13 @@ int orte_dt_pack_app_context(opal_buffer_t *buffer, const void *src, app_context = (orte_app_context_t**) src; for (i=0; i < num_vals; i++) { + /* pack the user's name for this app */ + if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, + (void*)(&(app_context[i]->name)), 1, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* pack the application index (for multiapp jobs) */ if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)(&(app_context[i]->idx)), 1, ORTE_STD_CNTR))) { @@ -770,6 +763,19 @@ int orte_dt_pack_app_context(opal_buffer_t *buffer, const void *src, return rc; } } + + /* pack the restart limits */ + if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, + (void*)(&(app_context[i]->max_local_restarts)), 1, OPAL_INT32))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, + (void*)(&(app_context[i]->max_global_restarts)), 1, OPAL_INT32))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } return ORTE_SUCCESS; diff --git a/orte/runtime/data_type_support/orte_dt_print_fns.c b/orte/runtime/data_type_support/orte_dt_print_fns.c index 9c8eab84ed..cdd221fc6f 100644 --- a/orte/runtime/data_type_support/orte_dt_print_fns.c +++ b/orte/runtime/data_type_support/orte_dt_print_fns.c @@ -216,8 +216,8 @@ int orte_dt_print_job(char **output, char *prefix, orte_job_t *src, opal_data_ty asprintf(&pfx2, "%s", prefix); } - asprintf(&tmp, "\n%sData for job: %s\tNum apps: %ld\tControls: %0x\tStdin target: %s\tState: %s\tAbort: %s", pfx2, - ORTE_JOBID_PRINT(src->jobid), + asprintf(&tmp, "\n%sData for job: %s\tName: %s\n%s\tNum apps: %ld\tControls: %0x\tStdin target: %s\tState: %s\tAbort: %s", pfx2, + ORTE_JOBID_PRINT(src->jobid), src->name, pfx2, (long)src->num_apps, src->controls, ORTE_VPID_PRINT(src->stdin_target), orte_job_state_to_str(src->state), src->abort ? "True" : "False"); asprintf(&pfx, "%s\t", pfx2); @@ -249,8 +249,7 @@ int orte_dt_print_job(char **output, char *prefix, orte_job_t *src, opal_data_ty tmp = tmp2; } - asprintf(&tmp2, "%s\n%sNum procs: %ld\tMax Local Restarts: %d\tMax Global Restarts %d", tmp, pfx, - (long)src->num_procs, src->max_local_restarts, src->max_global_restarts); + asprintf(&tmp2, "%s\n%sNum procs: %ld", tmp, pfx, (long)src->num_procs); free(tmp); tmp = tmp2; @@ -525,9 +524,9 @@ int orte_dt_print_app_context(char **output, char *prefix, orte_app_context_t *s asprintf(&pfx2, "%s", prefix); } - asprintf(&tmp, "\n%sData for app_context: index %lu\tapp: %s\n%s\tNum procs: %lu", - pfx2, (unsigned long)src->idx, src->app, - pfx2, (unsigned long)src->num_procs); + asprintf(&tmp, "\n%sData for app_context: name: %s\t index %lu\tapp: %s\n%s\tNum procs: %lu\tMax Local Restarts: %d\tMax Global Restarts %d", + pfx2, src->name, (unsigned long)src->idx, src->app, + pfx2, (unsigned long)src->num_procs, src->max_local_restarts, src->max_global_restarts); count = opal_argv_count(src->argv); for (i=0; i < count; i++) { diff --git a/orte/runtime/data_type_support/orte_dt_unpacking_fns.c b/orte/runtime/data_type_support/orte_dt_unpacking_fns.c index eba6012372..b49ce89a1b 100644 --- a/orte/runtime/data_type_support/orte_dt_unpacking_fns.c +++ b/orte/runtime/data_type_support/orte_dt_unpacking_fns.c @@ -308,22 +308,6 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest, return rc; } - /* unpack the max local restarts */ - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, - (&(jobs[i]->max_local_restarts)), &n, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* unpack the max global restarts */ - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, - (&(jobs[i]->max_global_restarts)), &n, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return rc; - } - #if OPAL_ENABLE_FT_CR == 1 /* unpack the ckpt state */ if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, @@ -596,6 +580,14 @@ int orte_dt_unpack_app_context(opal_buffer_t *buffer, void *dest, return ORTE_ERR_OUT_OF_RESOURCE; } + /* get the name */ + max_n = 1; + if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &(app_context[i]->name), + &max_n, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* get the app index number */ max_n = 1; if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &(app_context[i]->idx), @@ -836,6 +828,20 @@ int orte_dt_unpack_app_context(opal_buffer_t *buffer, void *dest, app_context[i]->preload_files_src_dir = NULL; } + /* unpack the restart limits */ + max_n=1; + if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &app_context[i]->max_local_restarts, + &max_n, OPAL_INT32))) { + ORTE_ERROR_LOG(rc); + return rc; + } + max_n=1; + if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &app_context[i]->max_global_restarts, + &max_n, OPAL_INT32))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } return ORTE_SUCCESS; diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index 8f841fc7c0..a7f246ec62 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -517,6 +517,7 @@ int orte_global_comm(orte_process_name_t *recipient, static void orte_app_context_construct(orte_app_context_t* app_context) { + app_context->name = NULL; app_context->idx=0; app_context->app=NULL; app_context->num_procs=0; @@ -534,10 +535,16 @@ static void orte_app_context_construct(orte_app_context_t* app_context) app_context->preload_files_dest_dir = NULL; app_context->preload_files_src_dir = NULL; app_context->used_on_node = false; + app_context->max_local_restarts = -1; + app_context->max_global_restarts = -1; } static void orte_app_context_destructor(orte_app_context_t* app_context) { + if (NULL != app_context->name) { + free(app_context->name); + } + if (NULL != app_context->app) { free (app_context->app); app_context->app = NULL; @@ -610,6 +617,7 @@ OBJ_CLASS_INSTANCE(orte_app_context_t, static void orte_job_construct(orte_job_t* job) { + job->name = NULL; job->jobid = ORTE_JOBID_INVALID; job->apps = OBJ_NEW(opal_pointer_array_t); opal_pointer_array_init(job->apps, @@ -644,8 +652,6 @@ static void orte_job_construct(orte_job_t* job) job->not_reported = true; job->enable_recovery = false; - job->max_local_restarts = -1; - job->max_global_restarts = -1; job->launch_msg_sent.tv_sec = 0; job->launch_msg_sent.tv_usec = 0; diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index 251cf93ea8..c33aa88513 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -170,6 +170,11 @@ struct orte_job_map_t; typedef struct { /** Parent object */ opal_object_t super; + /** unique name for this application - has + * nothing to do with argv[0], but has meaning + * to the user, if provided + */ + char *name; /** Unique index when multiple apps per job */ orte_app_idx_t idx; /** Absolute pathname of argv[0] */ @@ -208,6 +213,10 @@ typedef struct { char *preload_files_src_dir; /* is being used on the local node */ bool used_on_node; + /* max number of times a process can be restarted locally */ + int32_t max_local_restarts; + /* max number of times a process can be relocated to another node */ + int32_t max_global_restarts; } orte_app_context_t; ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_app_context_t); @@ -341,6 +350,8 @@ typedef uint8_t orte_job_controls_t; typedef struct { /** Base object so this can be put on a list */ opal_list_item_t super; + /* a name for this job */ + char *name; /* jobid for this job */ orte_jobid_t jobid; /* app_context array for this job */ @@ -392,14 +403,14 @@ typedef struct { struct orte_proc_t *aborted_proc; /* enable recovery of these processes */ bool enable_recovery; - /* max number of times a process can be restarted locally */ - int32_t max_local_restarts; - /* max number of times a process can be relocated to another node */ - int32_t max_global_restarts; /* time launch message was sent */ struct timeval launch_msg_sent; /* max time for launch msg to be received */ struct timeval max_launch_msg_recvd; + /* uid under which to run the job */ + int32_t uid; + /* gid under which to run the job */ + int32_t gid; #if OPAL_ENABLE_FT_CR == 1 /* ckpt state */ size_t ckpt_state;