1
1

Modify the nidmap utility to pass daemon vpids for nodes. With some mapping algorithms, it is possible for nodes to be skipped. This results in daemon vpids that differ from the index of their respective nodes in the node array, causing a daemon to not recognize the procs that it is supposed to launch.

This commit was SVN r18528.
Этот коммит содержится в:
Ralph Castain 2008-05-28 18:38:47 +00:00
родитель 28c763f751
Коммит f76240e7cc
4 изменённых файлов: 101 добавлений и 39 удалений

Просмотреть файл

@ -363,8 +363,13 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
/* ident this proc's node */ /* ident this proc's node */
node = (orte_nid_t*)orte_daemonmap.addr[jobdat->procmap[j].node]; node = (orte_nid_t*)orte_daemonmap.addr[jobdat->procmap[j].node];
ORTE_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:constructing child list - checking proc %s on node %d with daemon %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_VPID_PRINT(j),
jobdat->procmap[j].node, ORTE_VPID_PRINT(node->daemon)));
/* does this data belong to us? */ /* does this data belong to us? */
if ((int32_t)ORTE_PROC_MY_NAME->vpid == jobdat->procmap[j].node) { if (ORTE_PROC_MY_NAME->vpid == node->daemon) {
ORTE_OUTPUT_VERBOSE((5, orte_odls_globals.output, ORTE_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:constructing child list - found proc %s for me!", "%s odls:constructing child list - found proc %s for me!",

Просмотреть файл

@ -90,7 +90,9 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
*/ */
{ {
opal_byte_object_t bo; opal_byte_object_t bo;
int i;
orte_nid_t **nodes;
/* construct a nodemap */ /* construct a nodemap */
if (ORTE_SUCCESS != (rc = orte_util_encode_nodemap(&bo))) { if (ORTE_SUCCESS != (rc = orte_util_encode_nodemap(&bo))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
@ -102,7 +104,15 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
if (ORTE_SUCCESS != (rc = orte_util_decode_nodemap(&bo, &orte_daemonmap))) { if (ORTE_SUCCESS != (rc = orte_util_decode_nodemap(&bo, &orte_daemonmap))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return rc; return rc;
} }
/* print-out the map */
nodes = (orte_nid_t**)orte_daemonmap.addr;
for (i=0; i < orte_daemonmap.size; i++) {
if (NULL != nodes[i]) {
fprintf(stderr, "NIDMAP: name %s daemon %s arch %0x\n",
nodes[i]->name, ORTE_VPID_PRINT(nodes[i]->daemon), nodes[i]->arch);
}
}
} }
#endif #endif

Просмотреть файл

@ -270,6 +270,8 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_proc_t);
typedef struct { typedef struct {
/* nodename */ /* nodename */
char *name; char *name;
/* vpid of daemon on this node */
orte_vpid_t daemon;
/* arch of node */ /* arch of node */
uint32_t arch; uint32_t arch;
} orte_nid_t; } orte_nid_t;

Просмотреть файл

@ -36,8 +36,8 @@
int orte_util_encode_nodemap(opal_byte_object_t *boptr) int orte_util_encode_nodemap(opal_byte_object_t *boptr)
{ {
orte_job_t *jdata; orte_vpid_t *vpids;
orte_proc_t **procs; orte_node_t **nodes;
char prefix[ORTE_MAX_NODE_PREFIX], *tmp; char prefix[ORTE_MAX_NODE_PREFIX], *tmp;
int32_t i, len, firstnode, lastnode, nodenum, num_nodes; int32_t i, len, firstnode, lastnode, nodenum, num_nodes;
uint8_t command = ORTE_CONTIG_NODE_CMD; uint8_t command = ORTE_CONTIG_NODE_CMD;
@ -51,17 +51,16 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
int32_t *arch; int32_t *arch;
#endif #endif
/* get the daemon job's data */ /* setup a buffer for tmp use */
if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
procs = (orte_proc_t**)(jdata->procs)->addr;
OBJ_CONSTRUCT(&buf, opal_buffer_t); OBJ_CONSTRUCT(&buf, opal_buffer_t);
/* indicate number of nodes */ /* determine the number of nodes in the global node array */
num_nodes = jdata->num_procs; num_nodes = 0;
nodes = (orte_node_t**)orte_node_pool->addr;
while (NULL != nodes[num_nodes]) {
++num_nodes;
}
/* pack number of nodes */
opal_dss.pack(&buf, &num_nodes, 1, OPAL_INT32); opal_dss.pack(&buf, &num_nodes, 1, OPAL_INT32);
/* pack the HNP's node name - don't mess with /* pack the HNP's node name - don't mess with
@ -72,14 +71,14 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
*/ */
if (!orte_keep_fqdn_hostnames) { if (!orte_keep_fqdn_hostnames) {
char *ptr; char *ptr;
nodename = strdup(procs[0]->nodename); nodename = strdup(nodes[0]->name);
if (NULL != (ptr = strchr(nodename, '.'))) { if (NULL != (ptr = strchr(nodename, '.'))) {
*ptr = '\0'; *ptr = '\0';
} }
opal_dss.pack(&buf, &nodename, 1, OPAL_STRING); opal_dss.pack(&buf, &nodename, 1, OPAL_STRING);
free(nodename); free(nodename);
} else { } else {
opal_dss.pack(&buf, &procs[0]->nodename, 1, OPAL_STRING); opal_dss.pack(&buf, &nodes[0]->name, 1, OPAL_STRING);
} }
/* see if the cluster is configured with contiguous /* see if the cluster is configured with contiguous
@ -87,13 +86,13 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
*/ */
if (orte_contiguous_nodes < num_nodes) { if (orte_contiguous_nodes < num_nodes) {
/* discover the prefix - find first non-alpha character */ /* discover the prefix - find first non-alpha character */
len = strlen(procs[1]->nodename); len = strlen(nodes[1]->name);
memset(prefix, 0, ORTE_MAX_NODE_PREFIX); memset(prefix, 0, ORTE_MAX_NODE_PREFIX);
prefix[0] = procs[1]->nodename[0]; /* must start with alpha */ prefix[0] = nodes[1]->name[0]; /* must start with alpha */
for (i=1; i < len; i++) { for (i=1; i < len; i++) {
if (!isalpha(procs[1]->nodename[i])) { if (!isalpha(nodes[1]->name[i])) {
/* found a non-alpha char */ /* found a non-alpha char */
if (!isdigit(procs[1]->nodename[i])) { if (!isdigit(nodes[1]->name[i])) {
/* if it is anything but a digit, /* if it is anything but a digit,
* then that's not good * then that's not good
*/ */
@ -104,12 +103,12 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
/* okay, this defines end of the prefix. /* okay, this defines end of the prefix.
* convert rest of name to an offset * convert rest of name to an offset
*/ */
firstnode = strtol(&(procs[1]->nodename[i]), NULL, 10); firstnode = strtol(&(nodes[1]->name[i]), NULL, 10);
/* figure out how many digits are in the index */ /* figure out how many digits are in the index */
for (num_digs=0; isdigit(procs[1]->nodename[i+num_digs]); num_digs++); for (num_digs=0; isdigit(nodes[1]->name[i+num_digs]); num_digs++);
goto PACK; goto PACK;
} }
prefix[i] = procs[1]->nodename[i]; prefix[i] = nodes[1]->name[i];
} }
PACK: PACK:
@ -133,7 +132,7 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
"%s encode:nidmap:contig_nodes prefix %s num_digits %d offset %d", "%s encode:nidmap:contig_nodes prefix %s num_digits %d offset %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), prefix, num_digs, firstnode)); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), prefix, num_digs, firstnode));
lastnode = strtol(&(procs[2]->nodename[i]), NULL, 10); lastnode = strtol(&(nodes[2]->name[i]), NULL, 10);
if ((lastnode - firstnode) < 0) { if ((lastnode - firstnode) < 0) {
/* we are decrementing */ /* we are decrementing */
incdec = 0; incdec = 0;
@ -148,8 +147,8 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
/* cycle through the nodes - pack the starting offset /* cycle through the nodes - pack the starting offset
* and total number of nodes in each contiguous range * and total number of nodes in each contiguous range
*/ */
for (i=2; i < (int)jdata->num_procs; i++) { for (i=2; i < num_nodes; i++) {
nodenum = strtol(&(procs[i]->nodename[len]), NULL, 10); nodenum = strtol(&(nodes[i]->name[len]), NULL, 10);
step = nodenum -lastnode; step = nodenum -lastnode;
if (step < 0) { if (step < 0) {
/* we are decrementing */ /* we are decrementing */
@ -187,7 +186,7 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
for (i=1; i < num_nodes; i++) { for (i=1; i < num_nodes; i++) {
if (!orte_keep_fqdn_hostnames) { if (!orte_keep_fqdn_hostnames) {
char *ptr; char *ptr;
nodename = strdup(procs[i]->nodename); nodename = strdup(nodes[i]->name);
if (NULL != (ptr = strchr(nodename, '.'))) { if (NULL != (ptr = strchr(nodename, '.'))) {
*ptr = '\0'; *ptr = '\0';
} }
@ -197,7 +196,7 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
} }
free(nodename); free(nodename);
} else { } else {
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &procs[i]->nodename, 1, OPAL_STRING))) { if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &nodes[i]->name, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return rc; return rc;
} }
@ -205,12 +204,36 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
} }
} }
/* since the daemon vpids may not correspond to the node
* index, we need to also pack the vpid array for all
* daemons. This scenario can happen when the user is
* employing a mapping algo that doesn't use all allocated
* nodes, and sprinkles procs across them in some non-contig
* manner. For example, use of the seq mapper where only
* some nodes are used, and where the usage leaves "holes"
* in the node array, will cause the daemon vpids to not
* match their node array index
*/
/* allocate space for the daemon vpids */
vpids = (orte_vpid_t*)malloc(num_nodes * sizeof(orte_vpid_t));
for (i=0; i < num_nodes; i++) {
if (NULL == nodes[i]->daemon) {
/* some nodes may not have daemons on them */
vpids[i] = ORTE_VPID_INVALID;
continue;
}
vpids[i] = nodes[i]->daemon->name.vpid;
}
opal_dss.pack(&buf, vpids, num_nodes, ORTE_VPID);
free(vpids);
#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT #if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
/* allocate space for the node arch */ /* allocate space for the node arch */
arch = (int32_t*)malloc(num_nodes * 4); arch = (int32_t*)malloc(num_nodes * 4);
/* transfer the data from the nodes */ /* transfer the data from the nodes */
for (i=0; i < num_nodes; i++) { for (i=0; i < num_nodes; i++) {
arch[i] = procs[i]->node->arch; arch[i] = nodes[i]->arch;
} }
/* pack the values */ /* pack the values */
opal_dss.pack(&buf, arch, num_nodes, OPAL_INT32); opal_dss.pack(&buf, arch, num_nodes, OPAL_INT32);
@ -228,8 +251,9 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo, opal_pointer_array_t *nodes
{ {
int n, loc, k, diglen, namelen; int n, loc, k, diglen, namelen;
char *prefix, digits[10]; char *prefix, digits[10];
int32_t num_nodes, lastnode, endrange, i; int32_t num_nodes, lastnode, endrange, i, num_daemons;
orte_nid_t *node; orte_nid_t *node;
orte_vpid_t *vpids;
uint8_t command, num_digs; uint8_t command, num_digs;
orte_nid_t **nd; orte_nid_t **nd;
uint8_t incdec; uint8_t incdec;
@ -264,11 +288,6 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo, opal_pointer_array_t *nodes
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
/* if we are a daemon or the HNP, update our num_procs */
if (orte_process_info.hnp || orte_process_info.daemon) {
orte_process_info.num_procs = num_nodes;
}
/* set the size of the nidmap storage so we minimize /* set the size of the nidmap storage so we minimize
* realloc's * realloc's
*/ */
@ -341,6 +360,7 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo, opal_pointer_array_t *nodes
loc++; loc++;
} }
strncat(node->name, digits, num_digs); strncat(node->name, digits, num_digs);
node->daemon = ORTE_VPID_INVALID;
/* default the arch to our arch so that non-hetero /* default the arch to our arch so that non-hetero
* case will yield correct behavior * case will yield correct behavior
*/ */
@ -353,7 +373,7 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo, opal_pointer_array_t *nodes
opal_dss.unpack(&buf, &lastnode, &n, OPAL_INT32); opal_dss.unpack(&buf, &lastnode, &n, OPAL_INT32);
/* if that is -1, then it flags no more ranges */ /* if that is -1, then it flags no more ranges */
if (-1 == lastnode) { if (-1 == lastnode) {
goto arch; goto vpids;
} }
n=1; n=1;
opal_dss.unpack(&buf, &endrange, &n, OPAL_INT32); opal_dss.unpack(&buf, &endrange, &n, OPAL_INT32);
@ -370,6 +390,7 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo, opal_pointer_array_t *nodes
for (i=1; i < num_nodes; i++) { for (i=1; i < num_nodes; i++) {
node = (orte_nid_t*)malloc(sizeof(orte_nid_t)); node = (orte_nid_t*)malloc(sizeof(orte_nid_t));
node->name = NULL; node->name = NULL;
node->daemon = ORTE_VPID_INVALID;
/* default the arch to our arch so that non-hetero /* default the arch to our arch so that non-hetero
* case will yield correct behavior * case will yield correct behavior
*/ */
@ -382,7 +403,30 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo, opal_pointer_array_t *nodes
} }
} }
arch: vpids:
/* unpack the daemon vpids */
vpids = (orte_vpid_t*)malloc(num_nodes * sizeof(orte_vpid_t));
n=num_nodes;
opal_dss.unpack(&buf, vpids, &n, ORTE_VPID);
nd = (orte_nid_t**)nodes->addr;
/* transfer the data to the nidmap, counting the number of
* daemons in the system
*/
num_daemons = 0;
for (i=0; i < num_nodes; i++) {
nd[i]->daemon = vpids[i];
if (ORTE_VPID_INVALID != vpids[i]) {
++num_daemons;
}
}
free(vpids);
/* if we are a daemon or the HNP, update our num_procs */
if (orte_process_info.hnp || orte_process_info.daemon) {
orte_process_info.num_procs = num_daemons;
}
#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT #if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
/* allocate space for the node arch */ /* allocate space for the node arch */
arch = (int32_t*)malloc(num_nodes * 4); arch = (int32_t*)malloc(num_nodes * 4);
@ -400,9 +444,10 @@ arch:
if (0 < orte_output_get_verbosity(orte_debug_output)) { if (0 < orte_output_get_verbosity(orte_debug_output)) {
nd = (orte_nid_t**)nodes->addr; nd = (orte_nid_t**)nodes->addr;
for (i=0; i < num_nodes; i++) { for (i=0; i < num_nodes; i++) {
orte_output(0, "%s node[%d].name %s arch %0x", orte_output(0, "%s node[%d].name %s daemon %s arch %0x",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i,
(NULL == nd[i]) ? "NULL" : nd[i]->name, (NULL == nd[i]) ? "NULL" : nd[i]->name,
ORTE_VPID_PRINT(nd[i]->daemon),
(NULL == nd[i]) ? 0 : nd[i]->arch); (NULL == nd[i]) ? 0 : nd[i]->arch);
} }
} }