1
1

Change the app_idx type to uint32_t to support users who run large numbers of app_contexts. Introduce it as a new typedef (orte_app_idx_t) so the underlying type can be changed later with less effort.

This commit was SVN r22727.
Этот коммит содержится в:
Ralph Castain 2010-02-27 17:37:34 +00:00
родитель f4c3cceb5e
Коммит 2541aa98ab
13 изменённых файлов: 68 добавлений и 53 удалений

Просмотреть файл

@ -52,6 +52,10 @@ typedef uint16_t orte_node_rank_t;
#define ORTE_LOCAL_RANK_INVALID UINT16_MAX #define ORTE_LOCAL_RANK_INVALID UINT16_MAX
#define ORTE_NODE_RANK_INVALID UINT16_MAX #define ORTE_NODE_RANK_INVALID UINT16_MAX
/*
 * Index type for app_contexts. It is a dedicated typedef so the
 * underlying width can be changed in one place.
 * NOTE: the type is unsigned, so a negative value such as -1 cannot
 * be stored in it as an "unset" marker.
 */
typedef uint32_t orte_app_idx_t;
/* data type identifier to use when packing/unpacking an orte_app_idx_t;
 * must be kept in sync with the typedef above */
#define ORTE_APP_IDX OPAL_UINT32
/* largest value an orte_app_idx_t can hold */
#define ORTE_APP_IDX_MAX UINT32_MAX
/* /*
* general typedefs & structures * general typedefs & structures

Просмотреть файл

@ -89,9 +89,9 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
orte_job_map_t *map; orte_job_map_t *map;
opal_buffer_t *wireup; opal_buffer_t *wireup;
opal_byte_object_t bo, *boptr; opal_byte_object_t bo, *boptr;
int32_t numbytes, *tmp32; int32_t numbytes, *restarts;
int8_t flag; int8_t flag;
int8_t *tmp; orte_app_idx_t *app_idx;
orte_vpid_t i; orte_vpid_t i;
int j; int j;
orte_daemon_cmd_flag_t command; orte_daemon_cmd_flag_t command;
@ -358,7 +358,7 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
} }
/* pack the number of app_contexts for this job */ /* pack the number of app_contexts for this job */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->num_apps, 1, ORTE_STD_CNTR))) { if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->num_apps, 1, ORTE_APP_IDX))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return rc; return rc;
} }
@ -386,25 +386,25 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
free(bo.bytes); free(bo.bytes);
/* transfer and pack the app_idx and restart arrays for this job */ /* transfer and pack the app_idx and restart arrays for this job */
tmp = (int8_t*)malloc(jdata->num_procs); app_idx = (orte_app_idx_t*)malloc(jdata->num_procs * sizeof(orte_app_idx_t));
tmp32 = (int32_t*)malloc(jdata->num_procs * sizeof(int32_t)); restarts = (int32_t*)malloc(jdata->num_procs * sizeof(int32_t));
for (j=0, i=0; i < jdata->num_procs && j < jdata->procs->size; j++) { for (j=0, i=0; i < jdata->num_procs && j < jdata->procs->size; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
continue; continue;
} }
tmp[i] = proc->app_idx; app_idx[i] = proc->app_idx;
tmp32[i++] = proc->restarts; restarts[i++] = proc->restarts;
} }
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, tmp, jdata->num_procs, OPAL_INT8))) { if (ORTE_SUCCESS != (rc = opal_dss.pack(data, app_idx, jdata->num_procs, ORTE_APP_IDX))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return rc; return rc;
} }
free(tmp); free(app_idx);
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, tmp32, jdata->num_procs, OPAL_INT32))) { if (ORTE_SUCCESS != (rc = opal_dss.pack(data, restarts, jdata->num_procs, OPAL_INT32))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return rc; return rc;
} }
free(tmp32); free(restarts);
/* are there cpu_list strings? */ /* are there cpu_list strings? */
if (jdata->map->cpu_lists) { if (jdata->map->cpu_lists) {
@ -579,7 +579,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
opal_buffer_t alert; opal_buffer_t alert;
opal_list_item_t *item; opal_list_item_t *item;
int8_t flag; int8_t flag;
int8_t *app_idx=NULL; orte_app_idx_t *app_idx=NULL;
int32_t *restarts=NULL; int32_t *restarts=NULL;
char **slot_str=NULL; char **slot_str=NULL;
orte_jobid_t debugger; orte_jobid_t debugger;
@ -643,8 +643,8 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
} }
} }
/* fake an app_idx array */ /* fake an app_idx array */
app_idx = (int8_t*)malloc(jobdat->num_procs * sizeof(int8_t)); app_idx = (orte_app_idx_t*)malloc(jobdat->num_procs * sizeof(orte_app_idx_t));
memset(app_idx, 0, jobdat->num_procs * sizeof(int8_t)); memset(app_idx, 0, jobdat->num_procs * sizeof(orte_app_idx_t));
/* if we are doing a timing test, store the time the msg was recvd */ /* if we are doing a timing test, store the time the msg was recvd */
if (orte_timing) { if (orte_timing) {
jobdat->launch_msg_recvd.tv_sec = orte_daemon_msg_recvd.tv_sec; jobdat->launch_msg_recvd.tv_sec = orte_daemon_msg_recvd.tv_sec;
@ -684,8 +684,9 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
goto REPORT_ERROR; goto REPORT_ERROR;
} }
cnt = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, orte_odls_globals.debugger->apps, if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, orte_odls_globals.debugger->apps,
&(orte_odls_globals.debugger->num_apps), ORTE_APP_CONTEXT))) { &cnt, ORTE_APP_CONTEXT))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
goto REPORT_ERROR; goto REPORT_ERROR;
} }
@ -803,7 +804,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
} }
/* unpack the number of app_contexts for this job */ /* unpack the number of app_contexts for this job */
cnt=1; cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->num_apps, &cnt, ORTE_STD_CNTR))) { if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->num_apps, &cnt, ORTE_APP_IDX))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
goto REPORT_ERROR; goto REPORT_ERROR;
} }
@ -822,7 +823,8 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
goto REPORT_ERROR; goto REPORT_ERROR;
} }
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, jobdat->apps, &jobdat->num_apps, ORTE_APP_CONTEXT))) { cnt = jobdat->num_apps;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, jobdat->apps, &cnt, ORTE_APP_CONTEXT))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
goto REPORT_ERROR; goto REPORT_ERROR;
} }
@ -846,10 +848,10 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
} }
/* allocate memory for app_idx */ /* allocate memory for app_idx */
app_idx = (int8_t*)malloc(jobdat->num_procs); app_idx = (orte_app_idx_t*)malloc(jobdat->num_procs * sizeof(orte_app_idx_t));
/* unpack app_idx in one shot */ /* unpack app_idx in one shot */
cnt=jobdat->num_procs; cnt=jobdat->num_procs;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, app_idx, &cnt, OPAL_INT8))) { if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, app_idx, &cnt, ORTE_APP_IDX))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
goto REPORT_ERROR; goto REPORT_ERROR;
} }
@ -1269,9 +1271,9 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
char *job_str, *vpid_str, *param, *value; char *job_str, *vpid_str, *param, *value;
opal_list_item_t *item; opal_list_item_t *item;
orte_app_context_t *app, **apps; orte_app_context_t *app, **apps;
orte_std_cntr_t num_apps; orte_app_idx_t i, num_apps;
orte_odls_child_t *child=NULL; orte_odls_child_t *child=NULL;
int i, num_processors; int num_processors;
bool oversubscribed; bool oversubscribed;
int rc=ORTE_SUCCESS, ret; int rc=ORTE_SUCCESS, ret;
bool launch_failed=true; bool launch_failed=true;

Просмотреть файл

@ -79,7 +79,7 @@ static void orte_odls_child_constructor(orte_odls_child_t *ptr)
ptr->name = NULL; ptr->name = NULL;
ptr->restarts = 0; ptr->restarts = 0;
ptr->pid = 0; ptr->pid = 0;
ptr->app_idx = -1; ptr->app_idx = 0;
ptr->alive = false; ptr->alive = false;
ptr->coll_recvd = false; ptr->coll_recvd = false;
/* set the default state to "failed to start" so /* set the default state to "failed to start" so
@ -135,7 +135,7 @@ static void orte_odls_job_constructor(orte_odls_job_t *ptr)
} }
static void orte_odls_job_destructor(orte_odls_job_t *ptr) static void orte_odls_job_destructor(orte_odls_job_t *ptr)
{ {
orte_std_cntr_t i; orte_app_idx_t i;
OBJ_DESTRUCT(&ptr->lock); OBJ_DESTRUCT(&ptr->lock);
OBJ_DESTRUCT(&ptr->cond); OBJ_DESTRUCT(&ptr->cond);

Просмотреть файл

@ -93,7 +93,7 @@ typedef struct {
orte_process_name_t *name; /* the OmpiRTE name of the proc */ orte_process_name_t *name; /* the OmpiRTE name of the proc */
int32_t restarts; /* number of times this proc has been restarted */ int32_t restarts; /* number of times this proc has been restarted */
pid_t pid; /* local pid of the proc */ pid_t pid; /* local pid of the proc */
orte_std_cntr_t app_idx; /* index of the app_context for this proc */ orte_app_idx_t app_idx; /* index of the app_context for this proc */
bool alive; /* is this proc alive? */ bool alive; /* is this proc alive? */
bool coll_recvd; /* collective operation recvd */ bool coll_recvd; /* collective operation recvd */
orte_proc_state_t state; /* the state of the process */ orte_proc_state_t state; /* the state of the process */
@ -122,7 +122,7 @@ typedef struct orte_odls_job_t {
orte_jobid_t jobid; /* jobid for this data */ orte_jobid_t jobid; /* jobid for this data */
bool launch_msg_processed; /* launch msg has been fully processed */ bool launch_msg_processed; /* launch msg has been fully processed */
orte_app_context_t **apps; /* app_contexts for this job */ orte_app_context_t **apps; /* app_contexts for this job */
orte_std_cntr_t num_apps; /* number of app_contexts */ orte_app_idx_t num_apps; /* number of app_contexts */
orte_mapping_policy_t policy; /* mapping policy */ orte_mapping_policy_t policy; /* mapping policy */
int16_t cpus_per_rank; /* number of cpus/rank */ int16_t cpus_per_rank; /* number of cpus/rank */
int16_t stride; /* step size between cores of multi-core/rank procs */ int16_t stride; /* step size between cores of multi-core/rank procs */

Просмотреть файл

@ -165,7 +165,7 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src,
/* pack the number of apps */ /* pack the number of apps */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(jobs[i]->num_apps)), 1, ORTE_STD_CNTR))) { (void*)(&(jobs[i]->num_apps)), 1, ORTE_APP_IDX))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return rc; return rc;
} }

Просмотреть файл

@ -106,7 +106,8 @@ int orte_dt_size_job(size_t *size, orte_job_t *src, opal_data_type_t type)
{ {
size_t sz; size_t sz;
int32_t i; int32_t i;
orte_app_context_t **apps; orte_app_context_t *app;
orte_proc_t *proc;
/* account for the object itself */ /* account for the object itself */
*size = sizeof(orte_job_t); *size = sizeof(orte_job_t);
@ -114,9 +115,11 @@ int orte_dt_size_job(size_t *size, orte_job_t *src, opal_data_type_t type)
/* if src is NULL, then that's all we wanted */ /* if src is NULL, then that's all we wanted */
if (NULL == src) return ORTE_SUCCESS; if (NULL == src) return ORTE_SUCCESS;
apps = (orte_app_context_t**)src->apps->addr; for (i=0; i < src->apps->size; i++) {
for (i=0; i < src->num_apps; i++) { if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(src->apps, i))) {
opal_dss.size(&sz, apps[i], ORTE_APP_CONTEXT); continue;
}
opal_dss.size(&sz, app, ORTE_APP_CONTEXT);
*size += sz; *size += sz;
} }
@ -124,10 +127,11 @@ int orte_dt_size_job(size_t *size, orte_job_t *src, opal_data_type_t type)
*size += sz; *size += sz;
for (i=0; i < src->procs->size; i++) { for (i=0; i < src->procs->size; i++) {
if (NULL != src->procs->addr[i]) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(src->procs, i))) {
orte_dt_size_proc(&sz, (orte_proc_t *) src->procs->addr[i], ORTE_PROC); continue;
*size += sz;
} }
orte_dt_size_proc(&sz, proc, ORTE_PROC);
*size += sz;
} }
#if OPAL_ENABLE_FT == 1 #if OPAL_ENABLE_FT == 1

Просмотреть файл

@ -150,9 +150,10 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest,
int32_t *num_vals, opal_data_type_t type) int32_t *num_vals, opal_data_type_t type)
{ {
int rc; int rc;
int32_t i, j, n, np, nprocs; int32_t i, n, np, nprocs;
orte_job_t **jobs; orte_job_t **jobs;
orte_proc_t *proc; orte_proc_t *proc;
orte_app_idx_t j;
/* unpack into array of orte_job_t objects */ /* unpack into array of orte_job_t objects */
jobs = (orte_job_t**) dest; jobs = (orte_job_t**) dest;
@ -176,7 +177,7 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest,
/* unpack the num apps */ /* unpack the num apps */
n = 1; n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
(&(jobs[i]->num_apps)), &n, ORTE_STD_CNTR))) { (&(jobs[i]->num_apps)), &n, ORTE_APP_IDX))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return rc; return rc;
} }

Просмотреть файл

@ -782,7 +782,7 @@ static void orte_proc_construct(orte_proc_t* proc)
proc->local_rank = ORTE_LOCAL_RANK_INVALID; proc->local_rank = ORTE_LOCAL_RANK_INVALID;
proc->node_rank = ORTE_NODE_RANK_INVALID; proc->node_rank = ORTE_NODE_RANK_INVALID;
proc->state = ORTE_PROC_STATE_UNDEF; proc->state = ORTE_PROC_STATE_UNDEF;
proc->app_idx = -1; proc->app_idx = 0;
proc->slot_list = NULL; proc->slot_list = NULL;
proc->node = NULL; proc->node = NULL;
proc->nodename = NULL; proc->nodename = NULL;

Просмотреть файл

@ -167,7 +167,7 @@ typedef struct {
/** Parent object */ /** Parent object */
opal_object_t super; opal_object_t super;
/** Unique index when multiple apps per job */ /** Unique index when multiple apps per job */
int8_t idx; orte_app_idx_t idx;
/** Absolute pathname of argv[0] */ /** Absolute pathname of argv[0] */
char *app; char *app;
/** Number of copies of this process that are to be launched */ /** Number of copies of this process that are to be launched */
@ -341,7 +341,7 @@ typedef struct {
/* app_context array for this job */ /* app_context array for this job */
opal_pointer_array_t *apps; opal_pointer_array_t *apps;
/* number of app_contexts in the array */ /* number of app_contexts in the array */
orte_std_cntr_t num_apps; orte_app_idx_t num_apps;
/* flags to control the launch of this job - see above /* flags to control the launch of this job - see above
* for description of supported flags * for description of supported flags
*/ */
@ -423,7 +423,7 @@ struct orte_proc_t {
/* exit code */ /* exit code */
orte_exit_code_t exit_code; orte_exit_code_t exit_code;
/* the app_context that generated this proc */ /* the app_context that generated this proc */
int8_t app_idx; orte_app_idx_t app_idx;
/* a cpu list, if specified by the user */ /* a cpu list, if specified by the user */
char *slot_list; char *slot_list;
/* pointer to the node where this proc is executing */ /* pointer to the node where this proc is executing */

Просмотреть файл

@ -560,8 +560,8 @@ static void check_debugger(int fd, short event, void *arg)
void orte_debugger_init_before_spawn(orte_job_t *jdata) void orte_debugger_init_before_spawn(orte_job_t *jdata)
{ {
char *env_name; char *env_name;
orte_app_context_t **apps, *app; orte_app_context_t *app;
orte_std_cntr_t i; int i;
int32_t ljob; int32_t ljob;
if (!MPIR_being_debugged && !orte_in_parallel_debugger) { if (!MPIR_being_debugged && !orte_in_parallel_debugger) {
@ -582,12 +582,14 @@ void orte_debugger_init_before_spawn(orte_job_t *jdata)
} }
/* tell the procs they are being debugged */ /* tell the procs they are being debugged */
apps = (orte_app_context_t**)jdata->apps->addr;
env_name = mca_base_param_environ_variable("orte", env_name = mca_base_param_environ_variable("orte",
"in_parallel_debugger", NULL); "in_parallel_debugger", NULL);
for (i=0; i < jdata->num_apps; i++) { for (i=0; i < jdata->apps->size; i++) {
opal_setenv(env_name, "1", true, &apps[i]->env); if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
opal_setenv(env_name, "1", true, &app->env);
} }
free(env_name); free(env_name);

Просмотреть файл

@ -67,8 +67,8 @@ int orte_pre_condition_transports(orte_job_t *jdata)
char *cs_env, *string_key = NULL, *format = NULL; char *cs_env, *string_key = NULL, *format = NULL;
uint64_t unique_key[2]; uint64_t unique_key[2];
unsigned int *int_ptr; unsigned int *int_ptr;
orte_std_cntr_t n; int n;
orte_app_context_t **apps; orte_app_context_t *app;
#if !defined(__WINDOWS__) #if !defined(__WINDOWS__)
int fd_rand; int fd_rand;
@ -152,9 +152,11 @@ int orte_pre_condition_transports(orte_job_t *jdata)
return ORTE_ERR_OUT_OF_RESOURCE; return ORTE_ERR_OUT_OF_RESOURCE;
} }
apps = (orte_app_context_t**)jdata->apps->addr; for (n=0; n < jdata->apps->size; n++) {
for (n=0; n < jdata->num_apps; n++) { if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) {
opal_setenv(cs_env, string_key, true, &apps[n]->env); continue;
}
opal_setenv(cs_env, string_key, true, &app->env);
} }
free(cs_env); free(cs_env);

Просмотреть файл

@ -43,7 +43,7 @@ ORTE_DECLSPEC orte_proc_info_t orte_process_info = {
/* .my_hnp = */ {ORTE_JOBID_INVALID, ORTE_VPID_INVALID}, /* .my_hnp = */ {ORTE_JOBID_INVALID, ORTE_VPID_INVALID},
/* .my_hnp_uri = */ NULL, /* .my_hnp_uri = */ NULL,
/* .hnp_pid = */ 0, /* .hnp_pid = */ 0,
/* .app_num = */ -1, /* .app_num = */ 0,
/* .num_procs = */ 1, /* .num_procs = */ 1,
/* .num_nodes = */ 1, /* .num_nodes = */ 1,
/* .nodename = */ NULL, /* .nodename = */ NULL,
@ -116,7 +116,7 @@ int orte_proc_info(void)
mca_base_param_reg_int_name("orte", "app_num", mca_base_param_reg_int_name("orte", "app_num",
"Index of the app_context that defines this proc", "Index of the app_context that defines this proc",
true, false, -1, &tmp); true, false, 0, &tmp);
orte_process_info.app_num = tmp; orte_process_info.app_num = tmp;
/* get the process id */ /* get the process id */

Просмотреть файл

@ -82,7 +82,7 @@ struct orte_proc_info_t {
orte_process_name_t my_hnp; /**< Name of my hnp */ orte_process_name_t my_hnp; /**< Name of my hnp */
char *my_hnp_uri; /**< Contact info for my hnp */ char *my_hnp_uri; /**< Contact info for my hnp */
pid_t hnp_pid; /**< hnp pid - used if singleton */ pid_t hnp_pid; /**< hnp pid - used if singleton */
orte_std_cntr_t app_num; /**< our index into the app_context array */ orte_app_idx_t app_num; /**< our index into the app_context array */
orte_vpid_t num_procs; /**< number of processes in this job */ orte_vpid_t num_procs; /**< number of processes in this job */
int num_nodes; /**< number of nodes in the job */ int num_nodes; /**< number of nodes in the job */
char *nodename; /**< string name for this node */ char *nodename; /**< string name for this node */