1
1

Merge pull request #1959 from rhc54/topic/nodeid

The node index isn't normally passed with the packed node object, so …
Этот коммит содержится в:
rhc54 2016-08-12 13:30:10 -07:00 коммит произвёл GitHub
родитель 1ef3c86d44 d4327fd973
Коммит 8d67f753ca
2 изменённых файла: 195 добавлений и 159 удалений

Просмотреть файл

@@ -54,12 +54,12 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
{
int rc;
orte_proc_t *pptr;
int i, k;
int i, k, n, nlocalprocs;
opal_list_t *info, *pmap;
opal_value_t *kv;
orte_node_t *node, *n2;
opal_vpid_t vpid;
char **list, **procs, **micro, *tmp, *regex;
char **list, **procs, **micro, *tmp, *regex, *cpulist, *peerlist;
orte_job_t *dmns;
orte_job_map_t *map;
orte_app_context_t *app;
@@ -178,92 +178,12 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
kv->data.uint32 = node->index;
opal_list_append(info, &kv->super);
/* identify our local node object within the map,
* if we were included */
node = NULL;
map = (orte_job_map_t*)jdata->map;
for (i=0; i < map->nodes->size; i++) {
if (NULL == (n2 = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
continue;
}
if (n2 == pptr->node) {
node = n2;
break;
}
}
if (NULL != node) {
/* node size */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_NODE_SIZE);
kv->type = OPAL_UINT32;
kv->data.uint32 = node->num_procs;
opal_list_append(info, &kv->super);
/* construct the list of local peers, while adding
* each proc's locality info */
list = NULL;
procs = NULL;
vpid = ORTE_VPID_MAX;
for (i=0; i < node->procs->size; i++) {
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
continue;
}
if (pptr->name.jobid == jdata->jobid) {
opal_argv_append_nosize(&list, ORTE_VPID_PRINT(pptr->name.vpid));
if (pptr->name.vpid < vpid) {
vpid = pptr->name.vpid;
}
/* note that we have to pass the cpuset for each local
* peer so locality can be computed */
tmp = NULL;
if (orte_get_attribute(&pptr->attributes, ORTE_PROC_CPU_BITMAP, (void**)&tmp, OPAL_STRING)) {
if (NULL != tmp) {
opal_argv_append_nosize(&procs, tmp);
free(tmp);
} else {
opal_argv_append_nosize(&procs, "UNBOUND");
}
} else {
opal_argv_append_nosize(&procs, "UNBOUND");
}
/* go ahead and register this client */
if (OPAL_SUCCESS != (rc = opal_pmix.server_register_client(&pptr->name, uid, gid,
(void*)pptr, NULL, NULL))) {
ORTE_ERROR_LOG(rc);
}
}
}
/* construct the list of peers for transmission */
if (NULL != list) {
tmp = opal_argv_join(list, ',');
opal_argv_free(list);
list = NULL;
/* pass the list of peers */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_LOCAL_PEERS);
kv->type = OPAL_STRING;
kv->data.string = tmp;
opal_list_append(info, &kv->super);
}
/* construct the list of cpusets for transmission */
if (NULL != procs) {
tmp = opal_argv_join(procs, ':');
opal_argv_free(procs);
procs = NULL;
/* pass the list of cpusets */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_LOCAL_CPUSETS);
kv->type = OPAL_STRING;
kv->data.string = tmp;
opal_list_append(info, &kv->super);
}
/* pass the local ldr */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_LOCALLDR);
kv->type = OPAL_VPID;
kv->data.name.vpid = vpid;
opal_list_append(info, &kv->super);
}
/* pass our node size */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_NODE_SIZE);
kv->type = OPAL_UINT32;
kv->data.uint32 = node->num_procs;
opal_list_append(info, &kv->super);
/* univ size */
kv = OBJ_NEW(opal_value_t);
@@ -300,93 +220,207 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
kv->data.uint32 = jdata->total_slots_alloc;
opal_list_append(info, &kv->super);
/* identify our local node object within the map,
* if we were included */
node = NULL;
map = (orte_job_map_t*)jdata->map;
for (i=0; i < map->nodes->size; i++) {
if (NULL == (n2 = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
continue;
}
if (n2 == pptr->node) {
node = n2;
break;
}
}
if (NULL != node) {
vpid = ORTE_VPID_MAX;
for (i=0; i < node->procs->size; i++) {
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
continue;
}
if (pptr->name.jobid == jdata->jobid) {
if (pptr->name.vpid < vpid) {
vpid = pptr->name.vpid;
}
/* go ahead and register this client */
if (OPAL_SUCCESS != (rc = opal_pmix.server_register_client(&pptr->name, uid, gid,
(void*)pptr, NULL, NULL))) {
ORTE_ERROR_LOG(rc);
}
}
}
/* pass the local ldr */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_LOCALLDR);
kv->type = OPAL_VPID;
kv->data.name.vpid = vpid;
opal_list_append(info, &kv->super);
}
/* for each proc in this job, create an object that
* includes the info describing the proc so the recipient has a complete
* picture. This allows procs to connect to each other without
* an further info exchange, assuming the underlying transports
* support it */
* any further info exchange, assuming the underlying transports
* support it. We also pass all the proc-specific data here so
* that each proc can lookup info about every other proc in the job */
for (i=0; i < jdata->procs->size; i++) {
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
for (n=0; n < map->nodes->size; n++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) {
continue;
}
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_PROC_DATA);
kv->type = OPAL_PTR;
kv->data.ptr = OBJ_NEW(opal_list_t);
opal_list_append(info, &kv->super);
pmap = kv->data.ptr;
/* construct the list of local peers, while adding
* each proc's locality info */
list = NULL;
procs = NULL;
cpulist = NULL;
peerlist = NULL;
vpid = ORTE_VPID_MAX;
nlocalprocs = 0;
for (i=0; i < node->procs->size; i++) {
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
continue;
}
if (pptr->name.jobid == jdata->jobid) {
++nlocalprocs;
opal_argv_append_nosize(&list, ORTE_VPID_PRINT(pptr->name.vpid));
if (pptr->name.vpid < vpid) {
vpid = pptr->name.vpid;
}
/* note that we have to pass the cpuset for each local
* peer so locality can be computed */
tmp = NULL;
if (orte_get_attribute(&pptr->attributes, ORTE_PROC_CPU_BITMAP, (void**)&tmp, OPAL_STRING)) {
if (NULL != tmp) {
opal_argv_append_nosize(&procs, tmp);
free(tmp);
} else {
opal_argv_append_nosize(&procs, "UNBOUND");
}
} else {
opal_argv_append_nosize(&procs, "UNBOUND");
}
}
}
/* construct the list of peers for transmission */
if (NULL != list) {
peerlist = opal_argv_join(list, ',');
opal_argv_free(list);
list = NULL;
}
/* construct the list of cpusets for transmission */
if (NULL != procs) {
cpulist = opal_argv_join(procs, ':');
opal_argv_free(procs);
procs = NULL;
}
/* rank */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_RANK);
kv->type = OPAL_VPID;
kv->data.name.vpid = pptr->name.vpid;
opal_list_append(pmap, &kv->super);
/* now cycle across each proc on this node, passing all data that
* varies by proc */
for (i=0; i < node->procs->size; i++) {
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
continue;
}
/* setup the proc map object */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_PROC_DATA);
kv->type = OPAL_PTR;
kv->data.ptr = OBJ_NEW(opal_list_t);
opal_list_append(info, &kv->super);
pmap = kv->data.ptr;
/* appnum */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_APPNUM);
kv->type = OPAL_UINT32;
kv->data.uint32 = pptr->app_idx;
opal_list_append(pmap, &kv->super);
/* must start with rank */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_RANK);
kv->type = OPAL_VPID;
kv->data.name.vpid = pptr->name.vpid;
opal_list_append(pmap, &kv->super);
/* app ldr */
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx);
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_APPLDR);
kv->type = OPAL_VPID;
kv->data.name.vpid = app->first_rank;
opal_list_append(pmap, &kv->super);
/* pass the list of peers */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_LOCAL_PEERS);
kv->type = OPAL_STRING;
kv->data.string = peerlist;
opal_list_append(pmap, &kv->super);
/* global/univ rank */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_GLOBAL_RANK);
kv->type = OPAL_VPID;
kv->data.name.vpid = pptr->name.vpid + jdata->offset;
opal_list_append(pmap, &kv->super);
/* pass the list of cpusets */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_LOCAL_CPUSETS);
kv->type = OPAL_STRING;
kv->data.string = cpulist;
opal_list_append(pmap, &kv->super);
/* app rank */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_APP_RANK);
kv->type = OPAL_VPID;
kv->data.name.vpid = pptr->app_rank;
opal_list_append(pmap, &kv->super);
/* appnum */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_APPNUM);
kv->type = OPAL_UINT32;
kv->data.uint32 = pptr->app_idx;
opal_list_append(pmap, &kv->super);
/* app size */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_APP_SIZE);
kv->type = OPAL_UINT32;
kv->data.uint32 = app->num_procs;
opal_list_append(info, &kv->super);
/* app ldr */
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx);
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_APPLDR);
kv->type = OPAL_VPID;
kv->data.name.vpid = app->first_rank;
opal_list_append(pmap, &kv->super);
/* local rank */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_LOCAL_RANK);
kv->type = OPAL_UINT16;
kv->data.uint16 = pptr->local_rank;
opal_list_append(pmap, &kv->super);
/* global/univ rank */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_GLOBAL_RANK);
kv->type = OPAL_VPID;
kv->data.name.vpid = pptr->name.vpid + jdata->offset;
opal_list_append(pmap, &kv->super);
/* node rank */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_NODE_RANK);
kv->type = OPAL_UINT16;
kv->data.uint32 = pptr->node_rank;
opal_list_append(pmap, &kv->super);
/* app rank */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_APP_RANK);
kv->type = OPAL_VPID;
kv->data.name.vpid = pptr->app_rank;
opal_list_append(pmap, &kv->super);
/* hostname */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_HOSTNAME);
kv->type = OPAL_STRING;
kv->data.string = strdup(pptr->node->name);
opal_list_append(pmap, &kv->super);
/* app size */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_APP_SIZE);
kv->type = OPAL_UINT32;
kv->data.uint32 = app->num_procs;
opal_list_append(info, &kv->super);
/* node ID */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_NODEID);
kv->type = OPAL_UINT32;
kv->data.uint32 = pptr->node->index;
opal_list_append(pmap, &kv->super);
/* local rank */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_LOCAL_RANK);
kv->type = OPAL_UINT16;
kv->data.uint16 = pptr->local_rank;
opal_list_append(pmap, &kv->super);
/* node rank */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_NODE_RANK);
kv->type = OPAL_UINT16;
kv->data.uint32 = pptr->node_rank;
opal_list_append(pmap, &kv->super);
/* hostname */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_HOSTNAME);
kv->type = OPAL_STRING;
kv->data.string = strdup(pptr->node->name);
opal_list_append(pmap, &kv->super);
/* node ID */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_NODEID);
kv->type = OPAL_UINT32;
kv->data.uint32 = pptr->node->index;
opal_list_append(pmap, &kv->super);
}
/* cleanup */
if (NULL != cpulist) {
free(cpulist);
}
if (NULL != peerlist) {
free(peerlist);
}
}
/* mark the job as registered */

Просмотреть файл

@@ -280,6 +280,8 @@ int orte_util_decode_daemon_nodemap(opal_byte_object_t *bo)
ORTE_ERROR_LOG(rc);
return rc;
}
/* set the nodeid */
node->index = vpid;
/* do we already have this node? */
nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, vpid);
/* set the new node object into the array */