1
1

Change the app_idx type to uint32_t to support users who run large numbers of app_contexts. Define it as a new typedef (orte_app_idx_t) so the underlying type can be changed later with much less effort.

This commit was SVN r22727.
Этот коммит содержится в:
Ralph Castain 2010-02-27 17:37:34 +00:00
родитель f4c3cceb5e
Коммит 2541aa98ab
13 изменённых файлов: 68 добавлений и 53 удаления

Просмотреть файл

@@ -52,6 +52,10 @@ typedef uint16_t orte_node_rank_t;
#define ORTE_LOCAL_RANK_INVALID UINT16_MAX
#define ORTE_NODE_RANK_INVALID UINT16_MAX
/* index for app_contexts */
typedef uint32_t orte_app_idx_t;
#define ORTE_APP_IDX OPAL_UINT32
#define ORTE_APP_IDX_MAX UINT32_MAX
/*
* general typedefs & structures

Просмотреть файл

@@ -89,9 +89,9 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
orte_job_map_t *map;
opal_buffer_t *wireup;
opal_byte_object_t bo, *boptr;
int32_t numbytes, *tmp32;
int32_t numbytes, *restarts;
int8_t flag;
int8_t *tmp;
orte_app_idx_t *app_idx;
orte_vpid_t i;
int j;
orte_daemon_cmd_flag_t command;
@@ -358,7 +358,7 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
}
/* pack the number of app_contexts for this job */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->num_apps, 1, ORTE_STD_CNTR))) {
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->num_apps, 1, ORTE_APP_IDX))) {
ORTE_ERROR_LOG(rc);
return rc;
}
@@ -386,25 +386,25 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
free(bo.bytes);
/* transfer and pack the app_idx and restart arrays for this job */
tmp = (int8_t*)malloc(jdata->num_procs);
tmp32 = (int32_t*)malloc(jdata->num_procs * sizeof(int32_t));
app_idx = (orte_app_idx_t*)malloc(jdata->num_procs * sizeof(orte_app_idx_t));
restarts = (int32_t*)malloc(jdata->num_procs * sizeof(int32_t));
for (j=0, i=0; i < jdata->num_procs && j < jdata->procs->size; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
continue;
}
tmp[i] = proc->app_idx;
tmp32[i++] = proc->restarts;
app_idx[i] = proc->app_idx;
restarts[i++] = proc->restarts;
}
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, tmp, jdata->num_procs, OPAL_INT8))) {
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, app_idx, jdata->num_procs, ORTE_APP_IDX))) {
ORTE_ERROR_LOG(rc);
return rc;
}
free(tmp);
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, tmp32, jdata->num_procs, OPAL_INT32))) {
free(app_idx);
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, restarts, jdata->num_procs, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
free(tmp32);
free(restarts);
/* are there cpu_list strings? */
if (jdata->map->cpu_lists) {
@@ -579,7 +579,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
opal_buffer_t alert;
opal_list_item_t *item;
int8_t flag;
int8_t *app_idx=NULL;
orte_app_idx_t *app_idx=NULL;
int32_t *restarts=NULL;
char **slot_str=NULL;
orte_jobid_t debugger;
@@ -643,8 +643,8 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
}
}
/* fake an app_idx array */
app_idx = (int8_t*)malloc(jobdat->num_procs * sizeof(int8_t));
memset(app_idx, 0, jobdat->num_procs * sizeof(int8_t));
app_idx = (orte_app_idx_t*)malloc(jobdat->num_procs * sizeof(orte_app_idx_t));
memset(app_idx, 0, jobdat->num_procs * sizeof(orte_app_idx_t));
/* if we are doing a timing test, store the time the msg was recvd */
if (orte_timing) {
jobdat->launch_msg_recvd.tv_sec = orte_daemon_msg_recvd.tv_sec;
@@ -684,8 +684,9 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
goto REPORT_ERROR;
}
cnt = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, orte_odls_globals.debugger->apps,
&(orte_odls_globals.debugger->num_apps), ORTE_APP_CONTEXT))) {
&cnt, ORTE_APP_CONTEXT))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
@@ -803,7 +804,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
}
/* unpack the number of app_contexts for this job */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->num_apps, &cnt, ORTE_STD_CNTR))) {
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->num_apps, &cnt, ORTE_APP_IDX))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
@@ -822,7 +823,8 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
goto REPORT_ERROR;
}
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, jobdat->apps, &jobdat->num_apps, ORTE_APP_CONTEXT))) {
cnt = jobdat->num_apps;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, jobdat->apps, &cnt, ORTE_APP_CONTEXT))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
@@ -846,10 +848,10 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
}
/* allocate memory for app_idx */
app_idx = (int8_t*)malloc(jobdat->num_procs);
app_idx = (orte_app_idx_t*)malloc(jobdat->num_procs * sizeof(orte_app_idx_t));
/* unpack app_idx in one shot */
cnt=jobdat->num_procs;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, app_idx, &cnt, OPAL_INT8))) {
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, app_idx, &cnt, ORTE_APP_IDX))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
@@ -1269,9 +1271,9 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
char *job_str, *vpid_str, *param, *value;
opal_list_item_t *item;
orte_app_context_t *app, **apps;
orte_std_cntr_t num_apps;
orte_app_idx_t i, num_apps;
orte_odls_child_t *child=NULL;
int i, num_processors;
int num_processors;
bool oversubscribed;
int rc=ORTE_SUCCESS, ret;
bool launch_failed=true;

Просмотреть файл

@@ -79,7 +79,7 @@ static void orte_odls_child_constructor(orte_odls_child_t *ptr)
ptr->name = NULL;
ptr->restarts = 0;
ptr->pid = 0;
ptr->app_idx = -1;
ptr->app_idx = 0;
ptr->alive = false;
ptr->coll_recvd = false;
/* set the default state to "failed to start" so
@@ -135,7 +135,7 @@ static void orte_odls_job_constructor(orte_odls_job_t *ptr)
}
static void orte_odls_job_destructor(orte_odls_job_t *ptr)
{
orte_std_cntr_t i;
orte_app_idx_t i;
OBJ_DESTRUCT(&ptr->lock);
OBJ_DESTRUCT(&ptr->cond);

Просмотреть файл

@@ -93,7 +93,7 @@ typedef struct {
orte_process_name_t *name; /* the OmpiRTE name of the proc */
int32_t restarts; /* number of times this proc has been restarted */
pid_t pid; /* local pid of the proc */
orte_std_cntr_t app_idx; /* index of the app_context for this proc */
orte_app_idx_t app_idx; /* index of the app_context for this proc */
bool alive; /* is this proc alive? */
bool coll_recvd; /* collective operation recvd */
orte_proc_state_t state; /* the state of the process */
@@ -122,7 +122,7 @@ typedef struct orte_odls_job_t {
orte_jobid_t jobid; /* jobid for this data */
bool launch_msg_processed; /* launch msg has been fully processed */
orte_app_context_t **apps; /* app_contexts for this job */
orte_std_cntr_t num_apps; /* number of app_contexts */
orte_app_idx_t num_apps; /* number of app_contexts */
orte_mapping_policy_t policy; /* mapping policy */
int16_t cpus_per_rank; /* number of cpus/rank */
int16_t stride; /* step size between cores of multi-core/rank procs */

Просмотреть файл

@@ -165,7 +165,7 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src,
/* pack the number of apps */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(jobs[i]->num_apps)), 1, ORTE_STD_CNTR))) {
(void*)(&(jobs[i]->num_apps)), 1, ORTE_APP_IDX))) {
ORTE_ERROR_LOG(rc);
return rc;
}

Просмотреть файл

@@ -106,7 +106,8 @@ int orte_dt_size_job(size_t *size, orte_job_t *src, opal_data_type_t type)
{
size_t sz;
int32_t i;
orte_app_context_t **apps;
orte_app_context_t *app;
orte_proc_t *proc;
/* account for the object itself */
*size = sizeof(orte_job_t);
@@ -114,9 +115,11 @@ int orte_dt_size_job(size_t *size, orte_job_t *src, opal_data_type_t type)
/* if src is NULL, then that's all we wanted */
if (NULL == src) return ORTE_SUCCESS;
apps = (orte_app_context_t**)src->apps->addr;
for (i=0; i < src->num_apps; i++) {
opal_dss.size(&sz, apps[i], ORTE_APP_CONTEXT);
for (i=0; i < src->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(src->apps, i))) {
continue;
}
opal_dss.size(&sz, app, ORTE_APP_CONTEXT);
*size += sz;
}
@@ -124,10 +127,11 @@ int orte_dt_size_job(size_t *size, orte_job_t *src, opal_data_type_t type)
*size += sz;
for (i=0; i < src->procs->size; i++) {
if (NULL != src->procs->addr[i]) {
orte_dt_size_proc(&sz, (orte_proc_t *) src->procs->addr[i], ORTE_PROC);
*size += sz;
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(src->procs, i))) {
continue;
}
orte_dt_size_proc(&sz, proc, ORTE_PROC);
*size += sz;
}
#if OPAL_ENABLE_FT == 1

Просмотреть файл

@@ -150,9 +150,10 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest,
int32_t *num_vals, opal_data_type_t type)
{
int rc;
int32_t i, j, n, np, nprocs;
int32_t i, n, np, nprocs;
orte_job_t **jobs;
orte_proc_t *proc;
orte_app_idx_t j;
/* unpack into array of orte_job_t objects */
jobs = (orte_job_t**) dest;
@@ -176,7 +177,7 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest,
/* unpack the num apps */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
(&(jobs[i]->num_apps)), &n, ORTE_STD_CNTR))) {
(&(jobs[i]->num_apps)), &n, ORTE_APP_IDX))) {
ORTE_ERROR_LOG(rc);
return rc;
}

Просмотреть файл

@@ -782,7 +782,7 @@ static void orte_proc_construct(orte_proc_t* proc)
proc->local_rank = ORTE_LOCAL_RANK_INVALID;
proc->node_rank = ORTE_NODE_RANK_INVALID;
proc->state = ORTE_PROC_STATE_UNDEF;
proc->app_idx = -1;
proc->app_idx = 0;
proc->slot_list = NULL;
proc->node = NULL;
proc->nodename = NULL;

Просмотреть файл

@@ -167,7 +167,7 @@ typedef struct {
/** Parent object */
opal_object_t super;
/** Unique index when multiple apps per job */
int8_t idx;
orte_app_idx_t idx;
/** Absolute pathname of argv[0] */
char *app;
/** Number of copies of this process that are to be launched */
@@ -341,7 +341,7 @@ typedef struct {
/* app_context array for this job */
opal_pointer_array_t *apps;
/* number of app_contexts in the array */
orte_std_cntr_t num_apps;
orte_app_idx_t num_apps;
/* flags to control the launch of this job - see above
* for description of supported flags
*/
@@ -423,7 +423,7 @@ struct orte_proc_t {
/* exit code */
orte_exit_code_t exit_code;
/* the app_context that generated this proc */
int8_t app_idx;
orte_app_idx_t app_idx;
/* a cpu list, if specified by the user */
char *slot_list;
/* pointer to the node where this proc is executing */

Просмотреть файл

@@ -560,8 +560,8 @@ static void check_debugger(int fd, short event, void *arg)
void orte_debugger_init_before_spawn(orte_job_t *jdata)
{
char *env_name;
orte_app_context_t **apps, *app;
orte_std_cntr_t i;
orte_app_context_t *app;
int i;
int32_t ljob;
if (!MPIR_being_debugged && !orte_in_parallel_debugger) {
@@ -582,12 +582,14 @@ void orte_debugger_init_before_spawn(orte_job_t *jdata)
}
/* tell the procs they are being debugged */
apps = (orte_app_context_t**)jdata->apps->addr;
env_name = mca_base_param_environ_variable("orte",
"in_parallel_debugger", NULL);
for (i=0; i < jdata->num_apps; i++) {
opal_setenv(env_name, "1", true, &apps[i]->env);
for (i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
opal_setenv(env_name, "1", true, &app->env);
}
free(env_name);

Просмотреть файл

@@ -67,8 +67,8 @@ int orte_pre_condition_transports(orte_job_t *jdata)
char *cs_env, *string_key = NULL, *format = NULL;
uint64_t unique_key[2];
unsigned int *int_ptr;
orte_std_cntr_t n;
orte_app_context_t **apps;
int n;
orte_app_context_t *app;
#if !defined(__WINDOWS__)
int fd_rand;
@@ -152,9 +152,11 @@ int orte_pre_condition_transports(orte_job_t *jdata)
return ORTE_ERR_OUT_OF_RESOURCE;
}
apps = (orte_app_context_t**)jdata->apps->addr;
for (n=0; n < jdata->num_apps; n++) {
opal_setenv(cs_env, string_key, true, &apps[n]->env);
for (n=0; n < jdata->apps->size; n++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) {
continue;
}
opal_setenv(cs_env, string_key, true, &app->env);
}
free(cs_env);

Просмотреть файл

@@ -42,8 +42,8 @@ ORTE_DECLSPEC orte_proc_info_t orte_process_info = {
/* .my_daemon_uri = */ NULL,
/* .my_hnp = */ {ORTE_JOBID_INVALID, ORTE_VPID_INVALID},
/* .my_hnp_uri = */ NULL,
/* .hnp_pid = */ 0,
/* .app_num = */ -1,
/* .hnp_pid = */ 0,
/* .app_num = */ 0,
/* .num_procs = */ 1,
/* .num_nodes = */ 1,
/* .nodename = */ NULL,
@@ -116,7 +116,7 @@ int orte_proc_info(void)
mca_base_param_reg_int_name("orte", "app_num",
"Index of the app_context that defines this proc",
true, false, -1, &tmp);
true, false, 0, &tmp);
orte_process_info.app_num = tmp;
/* get the process id */

Просмотреть файл

@@ -82,7 +82,7 @@ struct orte_proc_info_t {
orte_process_name_t my_hnp; /**< Name of my hnp */
char *my_hnp_uri; /**< Contact info for my hnp */
pid_t hnp_pid; /**< hnp pid - used if singleton */
orte_std_cntr_t app_num; /**< our index into the app_context array */
orte_app_idx_t app_num; /**< our index into the app_context array */
orte_vpid_t num_procs; /**< number of processes in this job */
int num_nodes; /**< number of nodes in the job */
char *nodename; /**< string name for this node */