1
1

Add recovery data to the jobdat object

This commit was SVN r23078.
Этот коммит содержится в:
Ralph Castain 2010-05-03 04:06:13 +00:00
родитель 323224b84b
Коммит f994a7edf4
4 изменённых файла: 120 добавлений и 93 удаления

Просмотреть файл

@ -367,6 +367,12 @@ pack_add_procs:
return rc;
}
/* pack whether or not process recovery is allowed for this job */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->enable_recovery, 1, OPAL_BOOL))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the max number of local restarts allowed for this job */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->max_local_restarts, 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
@ -819,6 +825,12 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
/* unpack whether or not process recovery is allowed for this job */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->enable_recovery, &cnt, OPAL_BOOL))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
/* unpack the max number of local restarts allowed for this job */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->max_local_restarts, &cnt, ORTE_VPID))) {
@ -2911,6 +2923,7 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child,
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
opal_output(0, "%s restarting app %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->app);
rc = fork_local(app, child, app->env, jobdat);
if (ORTE_SUCCESS == rc) {
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);

Просмотреть файл

@ -73,99 +73,6 @@ int orte_odls_base_open(void)
*/
orte_odls_base_module_t orte_odls;
/*
 * Constructor for orte_odls_child_t: place every member of a newly
 * created child record into its initial state.
 */
static void orte_odls_child_constructor(orte_odls_child_t *ptr)
{
    /* pointer members start out unset */
    ptr->name = NULL;
    ptr->rml_uri = NULL;
    ptr->slot_list = NULL;

    /* numeric members start at zero */
    ptr->restarts = 0;
    ptr->pid = 0;
    ptr->app_idx = 0;
    ptr->exit_code = 0;

    /* Begin in the "failed to start" state so that a problem
     * during launch is reported correctly even if nothing else
     * ever updates the state.
     */
    ptr->state = ORTE_PROC_STATE_FAILED_TO_START;

    /* every status flag begins cleared */
    ptr->alive = false;
    ptr->coll_recvd = false;
    ptr->init_recvd = false;
    ptr->fini_recvd = false;
    ptr->waitpid_recvd = false;
    ptr->iof_complete = false;
    ptr->do_not_barrier = false;
}
/*
 * Destructor for orte_odls_child_t: release the memory referenced by
 * the name, rml_uri and slot_list members.
 */
static void orte_odls_child_destructor(orte_odls_child_t *ptr)
{
    /* free() is a no-op on NULL, so unset members need no guard */
    free(ptr->name);
    free(ptr->rml_uri);
    free(ptr->slot_list);
}
/* Instantiate the class for orte_odls_child_t, passing opal_list_item_t
 * (presumably the parent class - confirm against the OPAL object
 * system) together with the constructor and destructor defined above.
 */
OBJ_CLASS_INSTANCE(orte_odls_child_t,
opal_list_item_t,
orte_odls_child_constructor,
orte_odls_child_destructor);
/*
 * Constructor for orte_odls_job_t: construct the embedded objects and
 * give every other member its initial value.
 */
static void orte_odls_job_constructor(orte_odls_job_t *ptr)
{
    /* embedded objects that need their own construction */
    OBJ_CONSTRUCT(&ptr->lock, opal_mutex_t);
    OBJ_CONSTRUCT(&ptr->cond, opal_condition_t);
    OBJ_CONSTRUCT(&ptr->collection_bucket, opal_buffer_t);
    OBJ_CONSTRUCT(&ptr->local_collection, opal_buffer_t);

    /* job identity and state */
    ptr->jobid = ORTE_JOBID_INVALID;
    ptr->state = ORTE_JOB_STATE_UNDEF;
    ptr->launch_msg_processed = false;

    /* pointer members start out unset */
    ptr->apps = NULL;
    ptr->regexp = NULL;
    ptr->pmap = NULL;

    /* application and process bookkeeping */
    ptr->num_apps = 0;
    ptr->policy = 0;
    ptr->cpus_per_rank = 1;
    ptr->stride = 1;
    ptr->controls = 0;
    ptr->stdin_target = ORTE_VPID_INVALID;
    ptr->total_slots_alloc = 0;
    ptr->num_procs = 0;
    ptr->num_local_procs = 0;

    /* collective bookkeeping */
    ptr->collective_type = ORTE_GRPCOMM_COLL_NONE;
    ptr->num_contributors = 0;
    ptr->num_participating = -1;
    ptr->num_collected = 0;

    /* restart limit */
    ptr->max_local_restarts = 0;
}
/*
 * Destructor for orte_odls_job_t: destruct the embedded objects,
 * release each entry of the apps array and the array itself, and free
 * the regexp and pmap storage.
 */
static void orte_odls_job_destructor(orte_odls_job_t *ptr)
{
    orte_app_idx_t i;

    OBJ_DESTRUCT(&ptr->lock);
    OBJ_DESTRUCT(&ptr->cond);

    if (NULL != ptr->apps) {
        for (i = 0; i < ptr->num_apps; i++) {
            /* skip empty slots rather than hand NULL to OBJ_RELEASE */
            if (NULL != ptr->apps[i]) {
                OBJ_RELEASE(ptr->apps[i]);
            }
        }
        free(ptr->apps);
    }

    /* free() is a no-op on NULL */
    free(ptr->regexp);

    /* Free the pmap container whenever it exists; previously it was
     * leaked if its bytes member happened to be NULL.
     */
    if (NULL != ptr->pmap) {
        free(ptr->pmap->bytes);
        free(ptr->pmap);
    }

    OBJ_DESTRUCT(&ptr->collection_bucket);
    OBJ_DESTRUCT(&ptr->local_collection);
}
/* Instantiate the class for orte_odls_job_t, passing opal_list_item_t
 * (presumably the parent class - confirm against the OPAL object
 * system) together with the constructor and destructor defined above.
 */
OBJ_CLASS_INSTANCE(orte_odls_job_t,
opal_list_item_t,
orte_odls_job_constructor,
orte_odls_job_destructor);
/*
* Framework global variables
*/
@ -346,4 +253,99 @@ int orte_odls_base_open(void)
return ORTE_SUCCESS;
}
/*
 * Constructor for orte_odls_child_t: place every member of a newly
 * created child record into its initial state.
 */
static void orte_odls_child_constructor(orte_odls_child_t *ptr)
{
    /* pointer members start out unset */
    ptr->name = NULL;
    ptr->rml_uri = NULL;
    ptr->slot_list = NULL;

    /* numeric members start at zero */
    ptr->restarts = 0;
    ptr->pid = 0;
    ptr->app_idx = 0;
    ptr->exit_code = 0;

    /* Begin in the "failed to start" state so that a problem
     * during launch is reported correctly even if nothing else
     * ever updates the state.
     */
    ptr->state = ORTE_PROC_STATE_FAILED_TO_START;

    /* every status flag begins cleared */
    ptr->alive = false;
    ptr->coll_recvd = false;
    ptr->init_recvd = false;
    ptr->fini_recvd = false;
    ptr->waitpid_recvd = false;
    ptr->iof_complete = false;
    ptr->do_not_barrier = false;
}
/*
 * Destructor for orte_odls_child_t: release the memory referenced by
 * the name, rml_uri and slot_list members.
 */
static void orte_odls_child_destructor(orte_odls_child_t *ptr)
{
    /* free() is a no-op on NULL, so unset members need no guard */
    free(ptr->name);
    free(ptr->rml_uri);
    free(ptr->slot_list);
}
/* Instantiate the class for orte_odls_child_t, passing opal_list_item_t
 * (presumably the parent class - confirm against the OPAL object
 * system) together with the constructor and destructor defined above.
 */
OBJ_CLASS_INSTANCE(orte_odls_child_t,
opal_list_item_t,
orte_odls_child_constructor,
orte_odls_child_destructor);
/*
 * Constructor for orte_odls_job_t: construct the embedded objects and
 * give every other member its initial value.
 */
static void orte_odls_job_constructor(orte_odls_job_t *ptr)
{
    /* embedded objects that need their own construction */
    OBJ_CONSTRUCT(&ptr->lock, opal_mutex_t);
    OBJ_CONSTRUCT(&ptr->cond, opal_condition_t);
    OBJ_CONSTRUCT(&ptr->collection_bucket, opal_buffer_t);
    OBJ_CONSTRUCT(&ptr->local_collection, opal_buffer_t);

    /* job identity and state */
    ptr->jobid = ORTE_JOBID_INVALID;
    ptr->state = ORTE_JOB_STATE_UNDEF;
    ptr->launch_msg_processed = false;

    /* pointer members start out unset */
    ptr->apps = NULL;
    ptr->regexp = NULL;
    ptr->pmap = NULL;

    /* application and process bookkeeping */
    ptr->num_apps = 0;
    ptr->policy = 0;
    ptr->cpus_per_rank = 1;
    ptr->stride = 1;
    ptr->controls = 0;
    ptr->stdin_target = ORTE_VPID_INVALID;
    ptr->total_slots_alloc = 0;
    ptr->num_procs = 0;
    ptr->num_local_procs = 0;

    /* collective bookkeeping */
    ptr->collective_type = ORTE_GRPCOMM_COLL_NONE;
    ptr->num_contributors = 0;
    ptr->num_participating = -1;
    ptr->num_collected = 0;

    /* recovery is off and no local restarts are allowed by default */
    ptr->enable_recovery = false;
    ptr->max_local_restarts = 0;
}
/*
 * Destructor for orte_odls_job_t: destruct the embedded objects,
 * release each entry of the apps array and the array itself, and free
 * the regexp and pmap storage.
 */
static void orte_odls_job_destructor(orte_odls_job_t *ptr)
{
    orte_app_idx_t i;

    OBJ_DESTRUCT(&ptr->lock);
    OBJ_DESTRUCT(&ptr->cond);

    if (NULL != ptr->apps) {
        for (i = 0; i < ptr->num_apps; i++) {
            /* skip empty slots rather than hand NULL to OBJ_RELEASE */
            if (NULL != ptr->apps[i]) {
                OBJ_RELEASE(ptr->apps[i]);
            }
        }
        free(ptr->apps);
    }

    /* free() is a no-op on NULL */
    free(ptr->regexp);

    /* Free the pmap container whenever it exists; previously it was
     * leaked if its bytes member happened to be NULL.
     */
    if (NULL != ptr->pmap) {
        free(ptr->pmap->bytes);
        free(ptr->pmap);
    }

    OBJ_DESTRUCT(&ptr->collection_bucket);
    OBJ_DESTRUCT(&ptr->local_collection);
}
/* Instantiate the class for orte_odls_job_t, passing opal_list_item_t
 * (presumably the parent class - confirm against the OPAL object
 * system) together with the constructor and destructor defined above.
 */
OBJ_CLASS_INSTANCE(orte_odls_job_t,
opal_list_item_t,
orte_odls_job_constructor,
orte_odls_job_destructor);
#endif

Просмотреть файл

@ -876,6 +876,17 @@ LAUNCH_PROCS:
/* Exec the new executable */
if (10 < opal_output_get_verbosity(orte_odls_globals.output)) {
int jout;
opal_output(0, "%s STARTING %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), context->app);
for (jout=0; NULL != context->argv[jout]; jout++) {
opal_output(0, "%s\tARGV[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, context->argv[jout]);
}
for (jout=0; NULL != environ_copy[jout]; jout++) {
opal_output(0, "%s\tENVIRON[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, environ_copy[jout]);
}
}
execve(context->app, context->argv, environ_copy);
orte_show_help("help-odls-default.txt", "orte-odls-default:execv-error",
true, context->app, strerror(errno));

Просмотреть файл

@ -143,6 +143,7 @@ typedef struct orte_odls_job_t {
int num_participating;
int num_collected;
struct timeval launch_msg_recvd; /* when the launch msg for this job was recvd - for timing purposes only */
bool enable_recovery; /* enable recovery of failed processes */
int32_t max_local_restarts; /* max number of times a local proc can be restarted */
} orte_odls_job_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_odls_job_t);