The node index isn't normally passed with the packed node object, so we need to set it on the remote end because the orted needs to pass it down to the procs. Refactor the registration code to better package proc-level info; we will separate out the node and app levels in a subsequent change.
Этот коммит содержится в:
родитель
163999bce0
Коммит
d4327fd973
@ -54,12 +54,12 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
|
||||
{
|
||||
int rc;
|
||||
orte_proc_t *pptr;
|
||||
int i, k;
|
||||
int i, k, n, nlocalprocs;
|
||||
opal_list_t *info, *pmap;
|
||||
opal_value_t *kv;
|
||||
orte_node_t *node, *n2;
|
||||
opal_vpid_t vpid;
|
||||
char **list, **procs, **micro, *tmp, *regex;
|
||||
char **list, **procs, **micro, *tmp, *regex, *cpulist, *peerlist;
|
||||
orte_job_t *dmns;
|
||||
orte_job_map_t *map;
|
||||
orte_app_context_t *app;
|
||||
@ -178,92 +178,12 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
|
||||
kv->data.uint32 = node->index;
|
||||
opal_list_append(info, &kv->super);
|
||||
|
||||
/* identify our local node object within the map,
|
||||
* if we were included */
|
||||
node = NULL;
|
||||
map = (orte_job_map_t*)jdata->map;
|
||||
for (i=0; i < map->nodes->size; i++) {
|
||||
if (NULL == (n2 = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
|
||||
continue;
|
||||
}
|
||||
if (n2 == pptr->node) {
|
||||
node = n2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (NULL != node) {
|
||||
/* node size */
|
||||
/* pass our node size */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_NODE_SIZE);
|
||||
kv->type = OPAL_UINT32;
|
||||
kv->data.uint32 = node->num_procs;
|
||||
opal_list_append(info, &kv->super);
|
||||
/* construct the list of local peers, while adding
|
||||
* each proc's locality info */
|
||||
list = NULL;
|
||||
procs = NULL;
|
||||
vpid = ORTE_VPID_MAX;
|
||||
for (i=0; i < node->procs->size; i++) {
|
||||
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
|
||||
continue;
|
||||
}
|
||||
if (pptr->name.jobid == jdata->jobid) {
|
||||
opal_argv_append_nosize(&list, ORTE_VPID_PRINT(pptr->name.vpid));
|
||||
if (pptr->name.vpid < vpid) {
|
||||
vpid = pptr->name.vpid;
|
||||
}
|
||||
/* note that we have to pass the cpuset for each local
|
||||
* peer so locality can be computed */
|
||||
tmp = NULL;
|
||||
if (orte_get_attribute(&pptr->attributes, ORTE_PROC_CPU_BITMAP, (void**)&tmp, OPAL_STRING)) {
|
||||
if (NULL != tmp) {
|
||||
opal_argv_append_nosize(&procs, tmp);
|
||||
free(tmp);
|
||||
} else {
|
||||
opal_argv_append_nosize(&procs, "UNBOUND");
|
||||
}
|
||||
} else {
|
||||
opal_argv_append_nosize(&procs, "UNBOUND");
|
||||
}
|
||||
/* go ahead and register this client */
|
||||
if (OPAL_SUCCESS != (rc = opal_pmix.server_register_client(&pptr->name, uid, gid,
|
||||
(void*)pptr, NULL, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* construct the list of peers for transmission */
|
||||
if (NULL != list) {
|
||||
tmp = opal_argv_join(list, ',');
|
||||
opal_argv_free(list);
|
||||
list = NULL;
|
||||
/* pass the list of peers */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_LOCAL_PEERS);
|
||||
kv->type = OPAL_STRING;
|
||||
kv->data.string = tmp;
|
||||
opal_list_append(info, &kv->super);
|
||||
}
|
||||
/* construct the list of cpusets for transmission */
|
||||
if (NULL != procs) {
|
||||
tmp = opal_argv_join(procs, ':');
|
||||
opal_argv_free(procs);
|
||||
procs = NULL;
|
||||
/* pass the list of cpusets */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_LOCAL_CPUSETS);
|
||||
kv->type = OPAL_STRING;
|
||||
kv->data.string = tmp;
|
||||
opal_list_append(info, &kv->super);
|
||||
}
|
||||
/* pass the local ldr */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_LOCALLDR);
|
||||
kv->type = OPAL_VPID;
|
||||
kv->data.name.vpid = vpid;
|
||||
opal_list_append(info, &kv->super);
|
||||
}
|
||||
|
||||
/* univ size */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
@ -300,16 +220,108 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
|
||||
kv->data.uint32 = jdata->total_slots_alloc;
|
||||
opal_list_append(info, &kv->super);
|
||||
|
||||
/* identify our local node object within the map,
|
||||
* if we were included */
|
||||
node = NULL;
|
||||
map = (orte_job_map_t*)jdata->map;
|
||||
for (i=0; i < map->nodes->size; i++) {
|
||||
if (NULL == (n2 = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
|
||||
continue;
|
||||
}
|
||||
if (n2 == pptr->node) {
|
||||
node = n2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (NULL != node) {
|
||||
vpid = ORTE_VPID_MAX;
|
||||
for (i=0; i < node->procs->size; i++) {
|
||||
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
|
||||
continue;
|
||||
}
|
||||
if (pptr->name.jobid == jdata->jobid) {
|
||||
if (pptr->name.vpid < vpid) {
|
||||
vpid = pptr->name.vpid;
|
||||
}
|
||||
/* go ahead and register this client */
|
||||
if (OPAL_SUCCESS != (rc = opal_pmix.server_register_client(&pptr->name, uid, gid,
|
||||
(void*)pptr, NULL, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
}
|
||||
/* pass the local ldr */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_LOCALLDR);
|
||||
kv->type = OPAL_VPID;
|
||||
kv->data.name.vpid = vpid;
|
||||
opal_list_append(info, &kv->super);
|
||||
}
|
||||
|
||||
/* for each proc in this job, create an object that
|
||||
* includes the info describing the proc so the recipient has a complete
|
||||
* picture. This allows procs to connect to each other without
|
||||
* an further info exchange, assuming the underlying transports
|
||||
* support it */
|
||||
* any further info exchange, assuming the underlying transports
|
||||
* support it. We also pass all the proc-specific data here so
|
||||
* that each proc can lookup info about every other proc in the job */
|
||||
|
||||
for (i=0; i < jdata->procs->size; i++) {
|
||||
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
|
||||
for (n=0; n < map->nodes->size; n++) {
|
||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) {
|
||||
continue;
|
||||
}
|
||||
/* construct the list of local peers, while adding
|
||||
* each proc's locality info */
|
||||
list = NULL;
|
||||
procs = NULL;
|
||||
cpulist = NULL;
|
||||
peerlist = NULL;
|
||||
vpid = ORTE_VPID_MAX;
|
||||
nlocalprocs = 0;
|
||||
for (i=0; i < node->procs->size; i++) {
|
||||
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
|
||||
continue;
|
||||
}
|
||||
if (pptr->name.jobid == jdata->jobid) {
|
||||
++nlocalprocs;
|
||||
opal_argv_append_nosize(&list, ORTE_VPID_PRINT(pptr->name.vpid));
|
||||
if (pptr->name.vpid < vpid) {
|
||||
vpid = pptr->name.vpid;
|
||||
}
|
||||
/* note that we have to pass the cpuset for each local
|
||||
* peer so locality can be computed */
|
||||
tmp = NULL;
|
||||
if (orte_get_attribute(&pptr->attributes, ORTE_PROC_CPU_BITMAP, (void**)&tmp, OPAL_STRING)) {
|
||||
if (NULL != tmp) {
|
||||
opal_argv_append_nosize(&procs, tmp);
|
||||
free(tmp);
|
||||
} else {
|
||||
opal_argv_append_nosize(&procs, "UNBOUND");
|
||||
}
|
||||
} else {
|
||||
opal_argv_append_nosize(&procs, "UNBOUND");
|
||||
}
|
||||
}
|
||||
}
|
||||
/* construct the list of peers for transmission */
|
||||
if (NULL != list) {
|
||||
peerlist = opal_argv_join(list, ',');
|
||||
opal_argv_free(list);
|
||||
list = NULL;
|
||||
}
|
||||
/* construct the list of cpusets for transmission */
|
||||
if (NULL != procs) {
|
||||
cpulist = opal_argv_join(procs, ':');
|
||||
opal_argv_free(procs);
|
||||
procs = NULL;
|
||||
}
|
||||
|
||||
/* now cycle across each proc on this node, passing all data that
|
||||
* varies by proc */
|
||||
for (i=0; i < node->procs->size; i++) {
|
||||
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
|
||||
continue;
|
||||
}
|
||||
/* setup the proc map object */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_PROC_DATA);
|
||||
kv->type = OPAL_PTR;
|
||||
@ -317,13 +329,27 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
|
||||
opal_list_append(info, &kv->super);
|
||||
pmap = kv->data.ptr;
|
||||
|
||||
/* rank */
|
||||
/* must start with rank */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_RANK);
|
||||
kv->type = OPAL_VPID;
|
||||
kv->data.name.vpid = pptr->name.vpid;
|
||||
opal_list_append(pmap, &kv->super);
|
||||
|
||||
/* pass the list of peers */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_LOCAL_PEERS);
|
||||
kv->type = OPAL_STRING;
|
||||
kv->data.string = peerlist;
|
||||
opal_list_append(pmap, &kv->super);
|
||||
|
||||
/* pass the list of cpusets */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_LOCAL_CPUSETS);
|
||||
kv->type = OPAL_STRING;
|
||||
kv->data.string = cpulist;
|
||||
opal_list_append(pmap, &kv->super);
|
||||
|
||||
/* appnum */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_APPNUM);
|
||||
@ -388,6 +414,14 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
|
||||
kv->data.uint32 = pptr->node->index;
|
||||
opal_list_append(pmap, &kv->super);
|
||||
}
|
||||
/* cleanup */
|
||||
if (NULL != cpulist) {
|
||||
free(cpulist);
|
||||
}
|
||||
if (NULL != peerlist) {
|
||||
free(peerlist);
|
||||
}
|
||||
}
|
||||
|
||||
/* mark the job as registered */
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_NSPACE_REGISTERED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
|
||||
|
@ -280,6 +280,8 @@ int orte_util_decode_daemon_nodemap(opal_byte_object_t *bo)
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* set the nodeid */
|
||||
node->index = vpid;
|
||||
/* do we already have this node? */
|
||||
nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, vpid);
|
||||
/* set the new node object into the array */
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user