Add the ability to track how many times a process has been restarted, and to communicate that count to the process whenever it is restarted, in case it needs to behave differently on a restart than on its initial start.
This commit was SVN r22377.
Этот коммит содержится в:
родитель
f1f285c575
Коммит
ef1bfaa823
@ -89,7 +89,7 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
|
||||
orte_job_map_t *map;
|
||||
opal_buffer_t *wireup;
|
||||
opal_byte_object_t bo, *boptr;
|
||||
int32_t numbytes;
|
||||
int32_t numbytes, *tmp32;
|
||||
int8_t flag;
|
||||
int8_t *tmp;
|
||||
orte_vpid_t i;
|
||||
@ -385,19 +385,26 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
|
||||
/* release the data since it has now been copied into our buffer */
|
||||
free(bo.bytes);
|
||||
|
||||
/* transfer and pack the app_idx array for this job in one pack */
|
||||
/* transfer and pack the app_idx and restart arrays for this job */
|
||||
tmp = (int8_t*)malloc(jdata->num_procs);
|
||||
tmp32 = (int32_t*)malloc(jdata->num_procs * sizeof(int32_t));
|
||||
for (j=0, i=0; i < jdata->num_procs && j < jdata->procs->size; j++) {
|
||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
|
||||
continue;
|
||||
}
|
||||
tmp[i++] = proc->app_idx;
|
||||
tmp[i] = proc->app_idx;
|
||||
tmp32[i++] = proc->restarts;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, tmp, jdata->num_procs, OPAL_INT8))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
free(tmp);
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, tmp32, jdata->num_procs, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
free(tmp32);
|
||||
|
||||
/* are there cpu_list strings? */
|
||||
if (jdata->map->cpu_lists) {
|
||||
@ -573,6 +580,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
||||
opal_list_item_t *item;
|
||||
int8_t flag;
|
||||
int8_t *app_idx=NULL;
|
||||
int32_t *restarts=NULL;
|
||||
char **slot_str=NULL;
|
||||
orte_jobid_t debugger;
|
||||
bool add_child;
|
||||
@ -846,6 +854,15 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
||||
goto REPORT_ERROR;
|
||||
}
|
||||
|
||||
/* allocate memory for restarts */
|
||||
restarts = (int32_t*)malloc(jobdat->num_procs * sizeof(int32_t));
|
||||
/* unpack restarts in one shot */
|
||||
cnt=jobdat->num_procs;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, restarts, &cnt, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto REPORT_ERROR;
|
||||
}
|
||||
|
||||
/* unpack flag to indicate if slot_strings are present */
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) {
|
||||
@ -909,6 +926,7 @@ find_my_procs:
|
||||
ORTE_NAME_PRINT(&proc),
|
||||
(child->alive) ? "ALIVE" : "DEAD"));
|
||||
add_child = false;
|
||||
child->restarts = restarts[j];
|
||||
/* mark that this app_context is being used on this node */
|
||||
jobdat->apps[app_idx[j]]->used_on_node = true;
|
||||
break;
|
||||
@ -928,6 +946,7 @@ find_my_procs:
|
||||
goto REPORT_ERROR;
|
||||
}
|
||||
child->app_idx = app_idx[j]; /* save the index into the app_context objects */
|
||||
child->restarts = restarts[j];
|
||||
if (NULL != slot_str && NULL != slot_str[j]) {
|
||||
child->slot_list = strdup(slot_str[j]);
|
||||
}
|
||||
@ -949,6 +968,10 @@ find_my_procs:
|
||||
free(app_idx);
|
||||
app_idx = NULL;
|
||||
}
|
||||
if (NULL != restarts) {
|
||||
free(restarts);
|
||||
restarts = NULL;
|
||||
}
|
||||
if (NULL != slot_str) {
|
||||
for (j=0; j < jobdat->num_procs; j++) {
|
||||
free(slot_str[j]);
|
||||
@ -989,6 +1012,10 @@ REPORT_ERROR:
|
||||
free(app_idx);
|
||||
app_idx = NULL;
|
||||
}
|
||||
if (NULL != restarts) {
|
||||
free(restarts);
|
||||
restarts = NULL;
|
||||
}
|
||||
if (NULL != slot_str && NULL != jobdat) {
|
||||
for (j=0; j < jobdat->num_procs; j++) {
|
||||
free(slot_str[j]);
|
||||
@ -1748,6 +1775,20 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
free(param);
|
||||
free(value);
|
||||
|
||||
/* pass the number of restarts for this proc - will be zero for
|
||||
* an initial start, but procs would like to know if they are being
|
||||
* restarted so they can take appropriate action
|
||||
*/
|
||||
if (NULL == (param = mca_base_param_environ_variable("orte","num","restarts"))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto CLEANUP;
|
||||
}
|
||||
asprintf(&value, "%d", child->restarts);
|
||||
opal_setenv(param, value, true, &app->env);
|
||||
free(param);
|
||||
free(value);
|
||||
|
||||
/* if the proc isn't going to forward IO, then we need to flag that
|
||||
* it has "completed" iof termination as otherwise it will never fire
|
||||
*/
|
||||
|
@ -75,6 +75,7 @@ orte_odls_base_module_t orte_odls;
|
||||
static void orte_odls_child_constructor(orte_odls_child_t *ptr)
|
||||
{
|
||||
ptr->name = NULL;
|
||||
ptr->restarts = 0;
|
||||
ptr->pid = 0;
|
||||
ptr->app_idx = -1;
|
||||
ptr->alive = false;
|
||||
|
@ -85,6 +85,7 @@ typedef uint8_t orte_daemon_cmd_flag_t;
|
||||
typedef struct {
|
||||
opal_list_item_t super; /* required to place this on a list */
|
||||
orte_process_name_t *name; /* the OmpiRTE name of the proc */
|
||||
int32_t restarts; /* number of times this proc has been restarted */
|
||||
pid_t pid; /* local pid of the proc */
|
||||
orte_std_cntr_t app_idx; /* index of the app_context for this proc */
|
||||
bool alive; /* is this proc alive? */
|
||||
|
@ -484,6 +484,13 @@ int orte_dt_pack_proc(opal_buffer_t *buffer, const void *src,
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the number of restarts */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
|
||||
(void*)&(procs[i]->restarts), 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
/* pack the ckpt state */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
|
||||
|
@ -250,7 +250,7 @@ int orte_dt_print_job(char **output, char *prefix, orte_job_t *src, opal_data_ty
|
||||
tmp = tmp2;
|
||||
}
|
||||
|
||||
asprintf(&tmp2, "%s\n%sNum procs: %ld", tmp, pfx, (long)src->num_procs);
|
||||
asprintf(&tmp2, "%s\n%sNum procs: %ld\tMax Restarts: %d", tmp, pfx, (long)src->num_procs, src->max_restarts);
|
||||
free(tmp);
|
||||
tmp = tmp2;
|
||||
|
||||
@ -534,8 +534,8 @@ int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_
|
||||
free(tmp);
|
||||
tmp = tmp2;
|
||||
|
||||
asprintf(&tmp2, "%s\n%s\tState: %0x\tApp_context: %ld\tSlot list: %s", tmp, pfx2,
|
||||
src->state, (long)src->app_idx,
|
||||
asprintf(&tmp2, "%s\n%s\tState: %0x\tRestarts: %d\tApp_context: %ld\tSlot list: %s", tmp, pfx2,
|
||||
src->state, src->restarts, (long)src->app_idx,
|
||||
(NULL == src->slot_list) ? "NULL" : src->slot_list);
|
||||
free(tmp);
|
||||
|
||||
|
@ -513,6 +513,14 @@ int orte_dt_unpack_proc(opal_buffer_t *buffer, void *dest,
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* unpack the number of restarts */
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
|
||||
(&(procs[i]->restarts)), &n, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
/* unpack the ckpt state */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
|
||||
|
@ -597,6 +597,7 @@ static void orte_job_construct(orte_job_t* job)
|
||||
job->err_cbfunc = NULL;
|
||||
job->err_cbstates = ORTE_PROC_STATE_UNDEF;
|
||||
job->err_cbdata = NULL;
|
||||
job->max_restarts = INT32_MAX;
|
||||
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
job->ckpt_state = 0;
|
||||
@ -779,6 +780,7 @@ static void orte_proc_construct(orte_proc_t* proc)
|
||||
proc->nodename = NULL;
|
||||
proc->rml_uri = NULL;
|
||||
proc->beat = 0;
|
||||
proc->restarts = 0;
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
proc->ckpt_state = 0;
|
||||
proc->ckpt_snapshot_ref = NULL;
|
||||
|
@ -385,6 +385,8 @@ typedef struct {
|
||||
orte_proc_state_t err_cbstates;
|
||||
/* errmgr callback data */
|
||||
void *err_cbdata;
|
||||
/* max number of times a process can be restarted */
|
||||
int32_t max_restarts;
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
/* ckpt state */
|
||||
size_t ckpt_state;
|
||||
@ -435,6 +437,8 @@ struct orte_proc_t {
|
||||
char *rml_uri;
|
||||
/* seconds when last heartbeat was detected */
|
||||
int beat;
|
||||
/* number of times this process has been restarted */
|
||||
int32_t restarts;
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
/* ckpt state */
|
||||
size_t ckpt_state;
|
||||
|
@ -15,8 +15,8 @@
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
int rc;
|
||||
char hostname[512];
|
||||
int rc, restart;
|
||||
char hostname[512], *rstrt;
|
||||
pid_t pid;
|
||||
|
||||
if (0 > (rc = orte_init(&argc, &argv, ORTE_PROC_NON_MPI))) {
|
||||
@ -24,11 +24,15 @@ int main(int argc, char* argv[])
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (NULL != (rstrt = getenv("OMPI_MCA_orte_num_restarts"))) {
|
||||
restart = strtol(rstrt, NULL, 10);
|
||||
}
|
||||
|
||||
gethostname(hostname, 512);
|
||||
pid = getpid();
|
||||
|
||||
printf("orte_nodename: Node %s Name %s Pid %ld\n",
|
||||
hostname, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)pid);
|
||||
printf("orte_nodename: Node %s Name %s Pid %ld Restarts: %d\n",
|
||||
hostname, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)pid, restart);
|
||||
|
||||
orte_finalize();
|
||||
return 0;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user