1
1

Modify the node_rank and local_rank fields to be uint16_t so we can handle more than 256 procs/node. Change the type to a defined one so that any future change can be made easily, if required.

This commit was SVN r19637.
Этот коммит содержится в:
Ralph Castain 2008-09-25 13:39:08 +00:00
родитель 55738aeabe
Коммит 037231fbcb
18 изменённых файлов: 286 добавлений и 118 удалений

Просмотреть файл

@ -37,6 +37,20 @@ typedef int32_t orte_std_cntr_t; /** standard counters used in ORTE */
#define ORTE_STD_CNTR_MIN INT32_MIN
#define ORTE_STD_CNTR_INVALID -1
/**
 * Rank of a process on its node; one typedef each for the local
 * rank and the node rank. These values are never sent around on
 * their own, so no dedicated data-type support is created for
 * them - they are defined here solely for readability and so that
 * any future change of width can be made in one place.
 *
 * ORTE_LOCAL_RANK / ORTE_NODE_RANK name the OPAL type id to hand to
 * the pack/unpack routines for a rank value, and the *_MAX macros
 * give the largest value the current typedef can hold. All three
 * (typedef, type id, max) need to be changed together.
 *
 * NOTE(review): the *_MAX value also appears to double as the
 * "rank not yet assigned" marker (a newly constructed proc starts
 * with both ranks set to it) - confirm before treating it as a
 * usable rank.
 */
typedef uint16_t orte_local_rank_t;
typedef uint16_t orte_node_rank_t;
#define ORTE_LOCAL_RANK OPAL_UINT16
#define ORTE_NODE_RANK OPAL_UINT16
#define ORTE_LOCAL_RANK_MAX UINT16_MAX
#define ORTE_NODE_RANK_MAX UINT16_MAX
/*
* general typedefs & structures
*/

Просмотреть файл

@ -42,8 +42,8 @@ static int rte_finalize(void);
static bool proc_is_local(orte_process_name_t *proc);
static char* proc_get_hostname(orte_process_name_t *proc);
static uint32_t proc_get_arch(orte_process_name_t *proc);
static uint8_t proc_get_local_rank(orte_process_name_t *proc);
static uint8_t proc_get_node_rank(orte_process_name_t *proc);
static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc);
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc);
static int update_arch(orte_process_name_t *proc, uint32_t arch);
orte_ess_base_module_t orte_ess_alps_module = {
@ -257,7 +257,7 @@ static int update_arch(orte_process_name_t *proc, uint32_t arch)
return ORTE_SUCCESS;
}
static uint8_t proc_get_local_rank(orte_process_name_t *proc)
static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc)
{
orte_pmap_t *pmap;
@ -275,7 +275,7 @@ static uint8_t proc_get_local_rank(orte_process_name_t *proc)
return pmap->local_rank;
}
static uint8_t proc_get_node_rank(orte_process_name_t *proc)
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc)
{
orte_pmap_t *pmap;

Просмотреть файл

@ -41,8 +41,8 @@ static void rte_abort(int status, bool report) __opal_attribute_noreturn__;
static bool proc_is_local(orte_process_name_t *proc);
static char* proc_get_hostname(orte_process_name_t *proc);
static uint32_t proc_get_arch(orte_process_name_t *proc);
static uint8_t proc_get_local_rank(orte_process_name_t *proc);
static uint8_t proc_get_node_rank(orte_process_name_t *proc);
static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc);
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc);
static int update_arch(orte_process_name_t *proc, uint32_t arch);
orte_ess_base_module_t orte_ess_cnos_module = {
@ -149,7 +149,7 @@ static int update_arch(orte_process_name_t *proc, uint32_t arch)
return ORTE_SUCCESS;
}
static uint8_t proc_get_local_rank(orte_process_name_t *proc)
static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc)
{
/* RHC: someone more familiar with CNOS needs to
* fix this to return the correct value
@ -157,7 +157,7 @@ static uint8_t proc_get_local_rank(orte_process_name_t *proc)
return 0;
}
static uint8_t proc_get_node_rank(orte_process_name_t *proc)
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc)
{
/* RHC: someone more familiar with CNOS needs to
* fix this to return the correct value

8
orte/mca/ess/env/ess_env_module.c поставляемый
Просмотреть файл

@ -83,8 +83,8 @@ static int rte_finalize(void);
static bool proc_is_local(orte_process_name_t *proc);
static char* proc_get_hostname(orte_process_name_t *proc);
static uint32_t proc_get_arch(orte_process_name_t *proc);
static uint8_t proc_get_local_rank(orte_process_name_t *proc);
static uint8_t proc_get_node_rank(orte_process_name_t *proc);
static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc);
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc);
static int update_arch(orte_process_name_t *proc, uint32_t arch);
#if OPAL_ENABLE_FT == 1
@ -309,7 +309,7 @@ static int update_arch(orte_process_name_t *proc, uint32_t arch)
return ORTE_SUCCESS;
}
static uint8_t proc_get_local_rank(orte_process_name_t *proc)
static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc)
{
orte_pmap_t *pmap;
@ -327,7 +327,7 @@ static uint8_t proc_get_local_rank(orte_process_name_t *proc)
return pmap->local_rank;
}
static uint8_t proc_get_node_rank(orte_process_name_t *proc)
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc)
{
orte_pmap_t *pmap;

Просмотреть файл

@ -98,12 +98,12 @@ typedef uint32_t (*orte_ess_base_module_proc_get_arch_fn_t)(orte_process_name_t
/**
* Get the local rank of a remote process
*/
typedef uint8_t (*orte_ess_base_module_proc_get_local_rank_fn_t)(orte_process_name_t *proc);
typedef orte_local_rank_t (*orte_ess_base_module_proc_get_local_rank_fn_t)(orte_process_name_t *proc);
/**
* Get the node rank of a remote process
*/
typedef uint8_t (*orte_ess_base_module_proc_get_node_rank_fn_t)(orte_process_name_t *proc);
typedef orte_node_rank_t (*orte_ess_base_module_proc_get_node_rank_fn_t)(orte_process_name_t *proc);
/**
* Update the arch of a remote process

Просмотреть файл

@ -52,8 +52,8 @@ static int rte_finalize(void);
static bool proc_is_local(orte_process_name_t *proc);
static char* proc_get_hostname(orte_process_name_t *proc);
static uint32_t proc_get_arch(orte_process_name_t *proc);
static uint8_t proc_get_local_rank(orte_process_name_t *proc);
static uint8_t proc_get_node_rank(orte_process_name_t *proc);
static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc);
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc);
static int update_arch(orte_process_name_t *proc, uint32_t arch);
orte_ess_base_module_t orte_ess_lsf_module = {
@ -269,7 +269,7 @@ static int update_arch(orte_process_name_t *proc, uint32_t arch)
return ORTE_SUCCESS;
}
static uint8_t proc_get_local_rank(orte_process_name_t *proc)
static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc)
{
orte_pmap_t *pmap;
@ -287,7 +287,7 @@ static uint8_t proc_get_local_rank(orte_process_name_t *proc)
return pmap->local_rank;
}
static uint8_t proc_get_node_rank(orte_process_name_t *proc)
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc)
{
orte_pmap_t *pmap;

Просмотреть файл

@ -41,8 +41,8 @@ static void rte_abort(int status, bool report) __opal_attribute_noreturn__;
static bool proc_is_local(orte_process_name_t *proc);
static char* proc_get_hostname(orte_process_name_t *proc);
static uint32_t proc_get_arch(orte_process_name_t *proc);
static uint8_t proc_get_local_rank(orte_process_name_t *proc);
static uint8_t proc_get_node_rank(orte_process_name_t *proc);
static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc);
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc);
static int update_arch(orte_process_name_t *proc, uint32_t arch);
orte_ess_base_module_t orte_ess_portals_utcp_module = {
@ -161,7 +161,7 @@ static int update_arch(orte_process_name_t *proc, uint32_t arch)
return ORTE_SUCCESS;
}
static uint8_t proc_get_local_rank(orte_process_name_t *proc)
static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc)
{
/* RHC: someone more familiar with CNOS needs to
* fix this to return the correct value
@ -169,7 +169,7 @@ static uint8_t proc_get_local_rank(orte_process_name_t *proc)
return 0;
}
static uint8_t proc_get_node_rank(orte_process_name_t *proc)
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc)
{
/* RHC: someone more familiar with CNOS needs to
* fix this to return the correct value

Просмотреть файл

@ -70,8 +70,8 @@ static int rte_finalize(void);
static bool proc_is_local(orte_process_name_t *proc);
static char* proc_get_hostname(orte_process_name_t *proc);
static uint32_t proc_get_arch(orte_process_name_t *proc);
static uint8_t proc_get_local_rank(orte_process_name_t *proc);
static uint8_t proc_get_node_rank(orte_process_name_t *proc);
static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc);
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc);
static int update_arch(orte_process_name_t *proc, uint32_t arch);
orte_ess_base_module_t orte_ess_singleton_module = {
@ -501,7 +501,7 @@ static int update_arch(orte_process_name_t *proc, uint32_t arch)
return ORTE_SUCCESS;
}
static uint8_t proc_get_local_rank(orte_process_name_t *proc)
static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc)
{
orte_pmap_t *pmap;
@ -519,7 +519,7 @@ static uint8_t proc_get_local_rank(orte_process_name_t *proc)
return pmap->local_rank;
}
static uint8_t proc_get_node_rank(orte_process_name_t *proc)
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc)
{
orte_pmap_t *pmap;

Просмотреть файл

@ -52,8 +52,8 @@ static int rte_finalize(void);
static bool proc_is_local(orte_process_name_t *proc);
static char* proc_get_hostname(orte_process_name_t *proc);
static uint32_t proc_get_arch(orte_process_name_t *proc);
static uint8_t proc_get_local_rank(orte_process_name_t *proc);
static uint8_t proc_get_node_rank(orte_process_name_t *proc);
static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc);
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc);
static int update_arch(orte_process_name_t *proc, uint32_t arch);
@ -269,7 +269,7 @@ static int update_arch(orte_process_name_t *proc, uint32_t arch)
return ORTE_SUCCESS;
}
static uint8_t proc_get_local_rank(orte_process_name_t *proc)
static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc)
{
orte_pmap_t *pmap;
@ -287,7 +287,7 @@ static uint8_t proc_get_local_rank(orte_process_name_t *proc)
return pmap->local_rank;
}
static uint8_t proc_get_node_rank(orte_process_name_t *proc)
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc)
{
orte_pmap_t *pmap;

Просмотреть файл

@ -1375,6 +1375,8 @@ static void setup_singleton_jobdat(orte_jobid_t jobid)
orte_pmap_t pmap;
int32_t one32;
int8_t one8;
orte_local_rank_t lrank;
orte_node_rank_t nrank;
opal_buffer_t buffer;
int rc;
@ -1391,9 +1393,11 @@ static void setup_singleton_jobdat(orte_jobid_t jobid)
opal_dss.pack(&buffer, &(ORTE_PROC_MY_NAME->vpid), 1, ORTE_VPID); /* num_procs */
one32 = 0;
opal_dss.pack(&buffer, &one32, 1, OPAL_INT32); /* node index */
lrank = 0;
opal_dss.pack(&buffer, &lrank, 1, ORTE_LOCAL_RANK); /* local rank */
nrank = 0;
opal_dss.pack(&buffer, &nrank, 1, ORTE_NODE_RANK); /* node rank */
one8 = 0;
opal_dss.pack(&buffer, &one8, 1, OPAL_UINT8); /* local rank */
opal_dss.pack(&buffer, &one8, 1, OPAL_UINT8); /* node rank */
opal_dss.pack(&buffer, &one8, 1, OPAL_INT8); /* app_idx */
jobdat->pmap = (opal_byte_object_t*)malloc(sizeof(opal_byte_object_t));
opal_dss.unload(&buffer, (void**)&jobdat->pmap->bytes, &jobdat->pmap->size);

Просмотреть файл

@ -130,7 +130,6 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
orte_job_map_t *map = NULL;
orte_app_context_t **apps;
orte_node_t **nodes;
int node_name_index;
int proc_vpid_index;
char *param;
char **env = NULL;
@ -204,7 +203,6 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
/* Add basic orted command line options */
orte_plm_base_orted_append_basic_args(&argc, &argv, "env",
&proc_vpid_index,
&node_name_index,
true);
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
@ -294,10 +292,6 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
continue;
}
/* setup node name */
free(argv[node_name_index]);
argv[node_name_index] = strdup(node->name);
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tmd: launching on node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),

Просмотреть файл

@ -357,7 +357,7 @@ int orte_rmaps_base_compute_usage(orte_job_t *jdata)
orte_node_t **nodes;
orte_proc_t **procs, *psave, *psave2;
orte_vpid_t minv, minv2;
uint8_t local_rank;
orte_local_rank_t local_rank;
orte_job_map_t *map;
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
@ -387,13 +387,13 @@ int orte_rmaps_base_compute_usage(orte_job_t *jdata)
/* find the minimum vpid proc */
for (j=0; j < nodes[i]->num_procs; j++) {
if (procs[j]->name.jobid == jdata->jobid &&
UINT8_MAX == procs[j]->local_rank &&
ORTE_LOCAL_RANK_MAX == procs[j]->local_rank &&
procs[j]->name.vpid < minv) {
minv = procs[j]->name.vpid;
psave = procs[j];
}
/* no matter what job...still have to handle node_rank */
if (UINT8_MAX == procs[j]->node_rank &&
if (ORTE_NODE_RANK_MAX == procs[j]->node_rank &&
procs[j]->name.vpid < minv2) {
minv2 = procs[j]->name.vpid;
psave2 = procs[j];

Просмотреть файл

@ -405,14 +405,14 @@ int orte_dt_pack_proc(opal_buffer_t *buffer, const void *src,
/* pack the local rank */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(procs[i]->local_rank)), 1, OPAL_UINT8))) {
(void*)(&(procs[i]->local_rank)), 1, ORTE_LOCAL_RANK))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the node rank */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(procs[i]->node_rank)), 1, OPAL_UINT8))) {
(void*)(&(procs[i]->node_rank)), 1, ORTE_NODE_RANK))) {
ORTE_ERROR_LOG(rc);
return rc;
}

Просмотреть файл

@ -441,8 +441,8 @@ int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_
asprintf(&tmp, "\n%sData for proc: %s", pfx2, ORTE_NAME_PRINT(&src->name));
asprintf(&tmp2, "%s\n%s\tPid: %ld\tLocal rank: %ld\tNode rank: %ld", tmp, pfx2,
(long)src->pid, (long)src->local_rank, (long)src->node_rank);
asprintf(&tmp2, "%s\n%s\tPid: %ld\tLocal rank: %lu\tNode rank: %lu", tmp, pfx2,
(long)src->pid, (unsigned long)src->local_rank, (unsigned long)src->node_rank);
free(tmp);
tmp = tmp2;

Просмотреть файл

@ -442,7 +442,7 @@ int orte_dt_unpack_proc(opal_buffer_t *buffer, void *dest,
/* unpack the local rank */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
(&(procs[i]->local_rank)), &n, OPAL_UINT8))) {
(&(procs[i]->local_rank)), &n, ORTE_LOCAL_RANK))) {
ORTE_ERROR_LOG(rc);
return rc;
}
@ -450,7 +450,7 @@ int orte_dt_unpack_proc(opal_buffer_t *buffer, void *dest,
/* unpack the node rank */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
(&(procs[i]->node_rank)), &n, OPAL_UINT8))) {
(&(procs[i]->node_rank)), &n, ORTE_NODE_RANK))) {
ORTE_ERROR_LOG(rc);
return rc;
}

Просмотреть файл

@ -600,8 +600,8 @@ static void orte_proc_construct(orte_proc_t* proc)
{
proc->name = *ORTE_NAME_INVALID;
proc->pid = 0;
proc->local_rank = UINT8_MAX;
proc->node_rank = UINT8_MAX;
proc->local_rank = ORTE_LOCAL_RANK_MAX;
proc->node_rank = ORTE_NODE_RANK_MAX;
proc->state = ORTE_PROC_STATE_UNDEF;
proc->app_idx = -1;
proc->slot_list = NULL;

Просмотреть файл

@ -185,7 +185,7 @@ typedef struct {
/* array of pointers to procs on this node */
opal_pointer_array_t *procs;
/* next node rank on this node */
uint8_t next_node_rank;
orte_node_rank_t next_node_rank;
/* whether or not we are oversubscribed */
bool oversubscribed;
/** The node architecture, as reported by the remote node. This
@ -226,12 +226,12 @@ typedef struct {
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_node_t);
/* define a set of flags to control the launch of a job */
#define ORTE_JOB_CONTROL_LOCAL_SPAWN (uint16_t) 0x01
#define ORTE_JOB_CONTROL_NON_ORTE_JOB (uint16_t) 0x02
#define ORTE_JOB_CONTROL_DEBUGGER_DAEMON (uint16_t) 0x04
#define ORTE_JOB_CONTROL_FORWARD_OUTPUT (uint16_t) 0x08
#define ORTE_JOB_CONTROL_DO_NOT_MONITOR (uint16_t) 0x10
#define ORTE_JOB_CONTROL_FORWARD_COMM (uint16_t) 0x20
#define ORTE_JOB_CONTROL_LOCAL_SPAWN (uint16_t) 0x0001
#define ORTE_JOB_CONTROL_NON_ORTE_JOB (uint16_t) 0x0002
#define ORTE_JOB_CONTROL_DEBUGGER_DAEMON (uint16_t) 0x0004
#define ORTE_JOB_CONTROL_FORWARD_OUTPUT (uint16_t) 0x0008
#define ORTE_JOB_CONTROL_DO_NOT_MONITOR (uint16_t) 0x0010
#define ORTE_JOB_CONTROL_FORWARD_COMM (uint16_t) 0x0020
typedef struct {
/** Base object so this can be put on a list */
@ -299,13 +299,13 @@ struct orte_proc_t {
* rank on a node can perform certain fns -
* e.g., open an sm backing file
*/
uint8_t local_rank;
orte_local_rank_t local_rank;
/* local rank on the node across all procs
* and jobs known to this HNP - this is
* needed so that procs can do things like
* know which static IP port to use
*/
uint8_t node_rank;
orte_node_rank_t node_rank;
/* process state */
orte_proc_state_t state;
/* exit code */
@ -354,9 +354,9 @@ typedef struct {
/* index to node */
int32_t node;
/* local rank */
uint8_t local_rank;
orte_local_rank_t local_rank;
/* node rank */
uint8_t node_rank;
orte_node_rank_t node_rank;
} orte_pmap_t;
typedef struct {

Просмотреть файл

@ -60,7 +60,10 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
++num_nodes;
}
/* pack number of nodes */
opal_dss.pack(&buf, &num_nodes, 1, OPAL_INT32);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &num_nodes, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the HNP's node name - don't mess with
* trying to encode it - it could be different
@ -74,10 +77,16 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
if (NULL != (ptr = strchr(nodename, '.'))) {
*ptr = '\0';
}
opal_dss.pack(&buf, &nodename, 1, OPAL_STRING);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &nodename, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
free(nodename);
} else {
opal_dss.pack(&buf, &nodes[0]->name, 1, OPAL_STRING);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &nodes[0]->name, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* see if the cluster is configured with contiguous
@ -114,18 +123,30 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
/* begin encoding rest of map by indicating that this will
* be a contiguous node map
*/
opal_dss.pack(&buf, &command, 1, OPAL_UINT8);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &command, 1, OPAL_UINT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the prefix */
tmp = &prefix[0];
opal_dss.pack(&buf, &tmp, 1, OPAL_STRING);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &tmp, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
len = strlen(prefix);
/* pack the number of digits in the index */
opal_dss.pack(&buf, &num_digs, 1, OPAL_UINT8);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &num_digs, 1, OPAL_UINT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* and the starting offset */
opal_dss.pack(&buf, &firstnode, 1, OPAL_INT32);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &firstnode, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
"%s encode:nidmap:contig_nodes prefix %s num_digits %d offset %d",
@ -135,11 +156,17 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
if ((lastnode - firstnode) < 0) {
/* we are decrementing */
incdec = 0;
opal_dss.pack(&buf, &incdec, 1, OPAL_INT8);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &incdec, 1, OPAL_INT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
} else {
/* we are incrementing */
incdec = 1;
opal_dss.pack(&buf, &incdec, 1, OPAL_INT8);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &incdec, 1, OPAL_INT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
lastnode = firstnode;
@ -155,9 +182,15 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
}
if (step > 1) {
/* have a break - indicate end of range */
opal_dss.pack(&buf, &lastnode, 1, OPAL_INT32);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &lastnode, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* indicate start of new range */
opal_dss.pack(&buf, &nodenum, 1, OPAL_INT32);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &nodenum, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
"%s encode:nidmap:contig_nodes end range %d start next range %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lastnode, nodenum));
@ -165,13 +198,19 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
lastnode = nodenum;
}
/* pack end of range */
opal_dss.pack(&buf, &lastnode, 1, OPAL_INT32);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &lastnode, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
"%s encode:nidmap:contig_nodes end range %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lastnode));
/* pack flag end of ranges */
lastnode = -1;
opal_dss.pack(&buf, &lastnode, 1, OPAL_INT32);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &lastnode, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
} else {
/* if the nodes aren't contiguous, then we need
* to simply pack every nodename individually
@ -181,7 +220,10 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* indicate that this will not be a contiguous node map */
command = ORTE_NON_CONTIG_NODE_CMD;
opal_dss.pack(&buf, &command, 1, OPAL_UINT8);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &command, 1, OPAL_UINT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
for (i=1; i < num_nodes; i++) {
if (!orte_keep_fqdn_hostnames) {
char *ptr;
@ -224,7 +266,10 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
}
vpids[i] = nodes[i]->daemon->name.vpid;
}
opal_dss.pack(&buf, vpids, num_nodes, ORTE_VPID);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, vpids, num_nodes, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
free(vpids);
if (OMPI_ENABLE_HETEROGENEOUS_SUPPORT) {
@ -241,13 +286,19 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
* flag - no need to send everything
*/
num_digs = 0;
opal_dss.pack(&buf, &num_digs, 1, OPAL_UINT8);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &num_digs, 1, OPAL_UINT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
} else {
/* it isn't homo, so we have to pass the
* archs to the daemons
*/
num_digs = 1;
opal_dss.pack(&buf, &num_digs, 1, OPAL_UINT8);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &num_digs, 1, OPAL_UINT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* allocate space for the node arch */
arch = (int32_t*)malloc(num_nodes * 4);
/* transfer the data from the nodes */
@ -255,13 +306,19 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
arch[i] = nodes[i]->arch;
}
/* pack the values */
opal_dss.pack(&buf, arch, num_nodes, OPAL_INT32);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, arch, num_nodes, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
free(arch);
}
} else {
/* pack a flag indicating that the archs are the same */
num_digs = 0;
opal_dss.pack(&buf, &num_digs, 1, OPAL_UINT8);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &num_digs, 1, OPAL_UINT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* transfer the payload to the byte object */
@ -284,6 +341,7 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo, opal_pointer_array_t *nodes
int32_t index, step;
int32_t *arch;
opal_buffer_t buf;
int rc;
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
"%s decode:nidmap decoding nodemap",
@ -309,14 +367,20 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo, opal_pointer_array_t *nodes
/* unpack number of nodes */
n=1;
opal_dss.unpack(&buf, &num_nodes, &n, OPAL_INT32);
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &num_nodes, &n, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
"%s decode:nidmap decoding %d nodes with %d already loaded",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), num_nodes, nodes->lowest_free));
/* set the size of the nidmap storage so we minimize realloc's */
opal_pointer_array_set_size(nodes, num_nodes);
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_size(nodes, num_nodes))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* create the struct for the HNP's node */
node = OBJ_NEW(orte_nid_t);
@ -327,32 +391,53 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo, opal_pointer_array_t *nodes
/* unpack the name of the HNP's node */
n=1;
opal_dss.unpack(&buf, &(node->name), &n, OPAL_STRING);
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &(node->name), &n, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack flag to see if this is a contiguous node map or not */
n=1;
opal_dss.unpack(&buf, &command, &n, OPAL_UINT8);
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &command, &n, OPAL_UINT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_CONTIG_NODE_CMD == command) {
/* unpack the prefix */
n=1;
opal_dss.unpack(&buf, &prefix, &n, OPAL_STRING);
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &prefix, &n, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* the number of digits in the index */
n=1;
opal_dss.unpack(&buf, &num_digs, &n, OPAL_UINT8);
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &num_digs, &n, OPAL_UINT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* and the starting offset */
n=1;
opal_dss.unpack(&buf, &lastnode, &n, OPAL_INT32);
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &lastnode, &n, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack increment/decrement flag */
n=1;
opal_dss.unpack(&buf, &incdec, &n, OPAL_INT8);
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &incdec, &n, OPAL_INT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the end of the range */
n=1;
opal_dss.unpack(&buf, &endrange, &n, OPAL_INT32);
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &endrange, &n, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* setup loop params */
if (0 == incdec) {
@ -397,7 +482,10 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo, opal_pointer_array_t *nodes
goto process_daemons;
}
n=1;
opal_dss.unpack(&buf, &endrange, &n, OPAL_INT32);
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &endrange, &n, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (0 == incdec) {
endrange -= 1;
} else {
@ -417,7 +505,10 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo, opal_pointer_array_t *nodes
/* unpack the node's name */
n=1;
opal_dss.unpack(&buf, &(node->name), &n, OPAL_STRING);
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &(node->name), &n, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
}
@ -425,7 +516,10 @@ process_daemons:
/* unpack the daemon names */
vpids = (orte_vpid_t*)malloc(num_nodes * sizeof(orte_vpid_t));
n=num_nodes;
opal_dss.unpack(&buf, vpids, &n, ORTE_VPID);
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, vpids, &n, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* transfer the data to the nidmap, counting the number of
* daemons in the system
*/
@ -449,7 +543,10 @@ process_daemons:
* or could be that things just are homo anyway
*/
n=1;
opal_dss.unpack(&buf, &num_digs, &n, OPAL_UINT8);
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &num_digs, &n, OPAL_UINT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (0 == num_digs) {
/* homo situation */
orte_homogeneous_nodes = true;
@ -460,7 +557,10 @@ process_daemons:
arch = (int32_t*)malloc(num_nodes * 4);
/* unpack the values */
n=num_nodes;
opal_dss.unpack(&buf, arch, &n, OPAL_INT32);
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, arch, &n, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* transfer the data to the nodes */
nd = (orte_nid_t**)nodes->addr;
for (i=0; i < num_nodes; i++) {
@ -491,12 +591,18 @@ int orte_util_encode_pidmap(orte_job_t *jdata, opal_byte_object_t *boptr)
orte_vpid_t i;
int8_t *tmp, flag;
opal_buffer_t buf;
orte_local_rank_t *lrank;
orte_node_rank_t *nrank;
int rc;
/* setup the working buffer */
OBJ_CONSTRUCT(&buf, opal_buffer_t);
/* pack the number of procs */
opal_dss.pack(&buf, &jdata->num_procs, 1, ORTE_VPID);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jdata->num_procs, 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* allocate memory for the nodes */
nodes = (int32_t*)malloc(jdata->num_procs * 4);
@ -506,45 +612,68 @@ int orte_util_encode_pidmap(orte_job_t *jdata, opal_byte_object_t *boptr)
for (i=0; i < jdata->num_procs; i++) {
nodes[i] = procs[i]->node->index;
}
opal_dss.pack(&buf, nodes, jdata->num_procs, OPAL_INT32);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, nodes, jdata->num_procs, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* free node storage */
free(nodes);
/* allocate memory for the local_ranks */
tmp = (int8_t*)malloc(jdata->num_procs);
lrank = (orte_local_rank_t*)malloc(jdata->num_procs*sizeof(orte_local_rank_t));
/* transfer and pack them in one pack */
for (i=0; i < jdata->num_procs; i++) {
tmp[i] = procs[i]->local_rank;
lrank[i] = procs[i]->local_rank;
}
opal_dss.pack(&buf, tmp, jdata->num_procs, OPAL_UINT8);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, lrank, jdata->num_procs, ORTE_LOCAL_RANK))) {
ORTE_ERROR_LOG(rc);
return rc;
}
free(lrank);
/* transfer and pack the node ranks in one pack */
nrank = (orte_node_rank_t*)malloc(jdata->num_procs*sizeof(orte_node_rank_t));
for (i=0; i < jdata->num_procs; i++) {
tmp[i] = procs[i]->node_rank;
nrank[i] = procs[i]->node_rank;
}
opal_dss.pack(&buf, tmp, jdata->num_procs, OPAL_UINT8);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, nrank, jdata->num_procs, ORTE_NODE_RANK))) {
ORTE_ERROR_LOG(rc);
return rc;
}
free(nrank);
/* transfer and pack the app_idx in one pack */
tmp = (int8_t*)malloc(jdata->num_procs);
for (i=0; i < jdata->num_procs; i++) {
tmp[i] = procs[i]->app_idx;
}
opal_dss.pack(&buf, tmp, jdata->num_procs, OPAL_INT8);
/* free the storage */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, tmp, jdata->num_procs, OPAL_INT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
free(tmp);
/* are there cpu_list strings? */
if (jdata->map->cpu_lists) {
flag = (int)true;
opal_dss.pack(&buf, &flag, 1, OPAL_INT8);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &flag, 1, OPAL_INT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
for (i=0; i < jdata->num_procs; i++) {
opal_dss.pack(&buf, &procs[i]->slot_list, 1, OPAL_STRING);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &procs[i]->slot_list, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
} else {
flag = (int)false;
opal_dss.pack(&buf, &flag, 1, OPAL_INT8);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &flag, 1, OPAL_INT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* transfer the payload to the byte object */
@ -562,20 +691,29 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo, orte_vpid_t *nprocs,
orte_vpid_t i, num_procs;
orte_pmap_t pmap;
int32_t *nodes;
int8_t *local_rank, *node_rank, *idx;
orte_local_rank_t *local_rank;
orte_node_rank_t *node_rank;
int8_t *idx;
int8_t flag;
char **slots;
orte_std_cntr_t n;
opal_buffer_t buf;
int rc;
/* xfer the byte object to a buffer for unpacking */
/* load it into a buffer */
OBJ_CONSTRUCT(&buf, opal_buffer_t);
opal_dss.load(&buf, bo->bytes, bo->size);
if (ORTE_SUCCESS != (rc = opal_dss.load(&buf, bo->bytes, bo->size))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the number of procs */
n=1;
opal_dss.unpack(&buf, &num_procs, &n, ORTE_VPID);
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &num_procs, &n, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
*nprocs = num_procs;
/* allocate memory for the procs array */
@ -585,19 +723,28 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo, orte_vpid_t *nprocs,
nodes = (int32_t*)malloc(num_procs * 4);
/* unpack it in one shot */
n=num_procs;
opal_dss.unpack(&buf, nodes, &n, OPAL_INT32);
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, nodes, &n, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* allocate memory for local ranks */
local_rank = (int8_t*)malloc(num_procs);
local_rank = (orte_local_rank_t*)malloc(num_procs*sizeof(orte_local_rank_t));
/* unpack them in one shot */
n=num_procs;
opal_dss.unpack(&buf, local_rank, &n, OPAL_UINT8);
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, local_rank, &n, ORTE_LOCAL_RANK))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* allocate memory for node ranks */
node_rank = (int8_t*)malloc(num_procs);
node_rank = (orte_node_rank_t*)malloc(num_procs*sizeof(orte_node_rank_t));
/* unpack node ranks in one shot */
n=num_procs;
opal_dss.unpack(&buf, node_rank, &n, OPAL_UINT8);
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, node_rank, &n, ORTE_NODE_RANK))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* store the data */
for (i=0; i < num_procs; i++) {
@ -625,20 +772,29 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo, orte_vpid_t *nprocs,
idx = (int8_t*)malloc(num_procs);
/* unpack app_idx in one shot */
n=num_procs;
opal_dss.unpack(&buf, idx, &n, OPAL_INT8);
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, idx, &n, OPAL_INT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* hand the array back to the caller */
*app_idx = idx;
/* unpack flag to indicate if slot_strings are present */
n=1;
opal_dss.unpack(&buf, &flag, &n, OPAL_INT8);
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &flag, &n, OPAL_INT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (flag) {
/* allocate space */
slots = (char**)malloc(num_procs * sizeof(char*));
for (i=0; i < num_procs; i++) {
n=1;
opal_dss.unpack(&buf, &slots[i], &n, OPAL_STRING);
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &slots[i], &n, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
*slot_str = slots;
}