1
1

Correct the error codes and clarify/correct logic

Every mapper is required to set the locale, which is why it is an error
if the locale attribute isn't found. Likewise, it is an error for any
mapper to set a NULL locale as it makes no sense. However, I can see
that maybe some compiler or static code checker might want to see
concrete evidence we checked it - so check it in the right place.

Backport the equivalent code from PRRTE as we know that works - more
confidence than trying to add another patch to this old code.

Signed-off-by: Ralph Castain <rhc@pmix.org>
Этот коммит содержится в:
Ralph Castain 2020-04-18 06:32:07 -07:00
родитель 1cd89c9d7b
Коммит 79c426885a
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B63B630167D26BB5

Просмотреть файл

@ -10,7 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2011-2017 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
* Copyright (c) 2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2020 Huawei Technologies Co., Ltd. All rights reserved.
@ -138,13 +138,18 @@ static int rank_span(orte_job_t *jdata,
}
/* protect against bozo case */
locale = NULL;
if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR) ||
NULL == locale) {
/* all mappers are _required_ to set the locale where the proc
* has been mapped - it is therefore an error for this attribute
* not to be set. Likewise, only a programming error could allow
* the attribute to be set to a NULL value - however, we add that
* conditional here to silence any compiler warnings */
ORTE_ERROR_LOG(ORTE_ERROR);
return ORTE_ERROR;
}
/* ignore procs not on this object */
if (NULL == locale ||
!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) {
if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps:rank_span: proc at position %d is not on object %d",
j, i);
@ -180,7 +185,7 @@ static int rank_span(orte_job_t *jdata,
/* Are all the procs ranked? we don't want to crash on INVALID ranks */
if (cnt < app->num_procs) {
return ORTE_ERR_NOT_SUPPORTED;
return ORTE_ERR_FAILED_TO_MAP;
}
}
@ -265,13 +270,18 @@ static int rank_fill(orte_job_t *jdata,
}
/* protect against bozo case */
locale = NULL;
if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR) ||
NULL == locale) {
/* all mappers are _required_ to set the locale where the proc
* has been mapped - it is therefore an error for this attribute
* not to be set. Likewise, only a programming error could allow
* the attribute to be set to a NULL value - however, we add that
* conditional here to silence any compiler warnings */
ORTE_ERROR_LOG(ORTE_ERROR);
return ORTE_ERROR;
}
/* ignore procs not on this object */
if (NULL == locale ||
!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) {
if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps:rank_fill: proc at position %d is not on object %d",
j, i);
@ -304,7 +314,7 @@ static int rank_fill(orte_job_t *jdata,
/* Are all the procs ranked? we don't want to crash on INVALID ranks */
if (cnt < app->num_procs) {
return ORTE_ERR_NOT_SUPPORTED;
return ORTE_ERR_FAILED_TO_MAP;
}
}
@ -321,11 +331,12 @@ static int rank_by(orte_job_t *jdata,
orte_vpid_t num_ranked=0;
orte_node_t *node;
orte_proc_t *proc, *pptr;
orte_vpid_t vpid, np;
orte_vpid_t vpid;
int cnt;
opal_pointer_array_t objs;
hwloc_obj_t locale;
orte_app_idx_t napp;
bool noassign;
if (ORTE_RANKING_SPAN & ORTE_GET_RANKING_DIRECTIVE(jdata->map->ranking)) {
return rank_span(jdata, target, cache_level);
@ -385,89 +396,97 @@ static int rank_by(orte_job_t *jdata,
* of procs on the node can't be used to tell us when we
* are done. Instead, we have to just keep going until all
* procs are ranked - which means we have to make one extra
* pass thru the loop
* pass thru the loop. In addition, if we pass thru the entire
* loop without assigning anything then we are done
*
* Perhaps someday someone will come up with a more efficient
* algorithm, but this works for now.
*/
i = 0;
while (cnt < app->num_procs &&
((i < (int)node->num_procs) || (i < num_objs))) {
/* get the next object */
obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i % num_objs);
if (NULL == obj) {
while (cnt < app->num_procs) {
noassign = true;
for (i=0; i < num_objs && cnt < app->num_procs; i++) {
/* get the next object */
obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i);
if (NULL == obj) {
break;
}
/* scan across the procs and find the first unassigned one that includes this object */
for (j=0; j < node->procs->size && cnt < app->num_procs; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
}
/* ignore procs from other jobs */
if (proc->name.jobid != jdata->jobid) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps:rank_by skipping proc %s - from another job, num_ranked %d",
ORTE_NAME_PRINT(&proc->name), num_ranked);
continue;
}
/* ignore procs that are already ranked */
if (ORTE_VPID_INVALID != proc->name.vpid) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps:rank_by skipping proc %s - already ranked, num_ranked %d",
ORTE_NAME_PRINT(&proc->name), num_ranked);
continue;
}
/* ignore procs from other apps - we will get to them */
if (proc->app_idx != app->idx) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps:rank_by skipping proc %s - from another app, num_ranked %d",
ORTE_NAME_PRINT(&proc->name), num_ranked);
continue;
}
/* protect against bozo case */
locale = NULL;
if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR) ||
NULL == locale) {
/* all mappers are _required_ to set the locale where the proc
* has been mapped - it is therefore an error for this attribute
* not to be set. Likewise, only a programming error could allow
* the attribute to be set to a NULL value - however, we add that
* conditional here to silence any compiler warnings */
ORTE_ERROR_LOG(ORTE_ERROR);
return ORTE_ERROR;
}
/* ignore procs not on this object */
if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps:rank_by: proc at position %d is not on object %d",
j, i);
continue;
}
/* assign the vpid */
proc->name.vpid = vpid;
if (0 == cnt) {
app->first_rank = proc->name.vpid;
}
cnt++;
noassign = false;
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps:rank_by: proc in position %d is on object %d assigned rank %s",
j, i, ORTE_VPID_PRINT(proc->name.vpid));
/* insert the proc into the jdata array */
if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid))) {
OBJ_RELEASE(pptr);
}
OBJ_RETAIN(proc);
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&objs);
return rc;
}
num_ranked++;
/* track where the highest vpid landed - this is our
* new bookmark
*/
jdata->bookmark = node;
/* move to next object */
break;
}
}
if (noassign) {
break;
}
/* scan across the procs and find the one that is on this object */
np = 0;
for (j=0; np < node->num_procs && j < node->procs->size && cnt < app->num_procs; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
}
np++;
/* ignore procs from other jobs */
if (proc->name.jobid != jdata->jobid) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps:rank_by skipping proc %s - from another job, num_ranked %d",
ORTE_NAME_PRINT(&proc->name), num_ranked);
continue;
}
/* ignore procs that are already ranked */
if (ORTE_VPID_INVALID != proc->name.vpid) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps:rank_by skipping proc %s - already ranked, num_ranked %d",
ORTE_NAME_PRINT(&proc->name), num_ranked);
continue;
}
/* ignore procs from other apps */
if (proc->app_idx != app->idx) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps:rank_by skipping proc %s - from another app, num_ranked %d",
ORTE_NAME_PRINT(&proc->name), num_ranked);
continue;
}
/* protect against bozo case */
locale = NULL;
if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
ORTE_ERROR_LOG(ORTE_ERROR);
return ORTE_ERROR;
}
/* ignore procs not on this object */
if (NULL == locale ||
!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps:rank_by: proc at position %d is not on object %d",
j, i % num_objs);
continue;
}
/* assign the vpid */
proc->name.vpid = vpid++;
if (0 == cnt) {
app->first_rank = proc->name.vpid;
}
cnt++;
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps:rank_by: proc in position %d is on object %d assigned rank %s",
j, i, ORTE_VPID_PRINT(proc->name.vpid));
/* insert the proc into the jdata array */
if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid))) {
OBJ_RELEASE(pptr);
}
OBJ_RETAIN(proc);
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&objs);
return rc;
}
num_ranked++;
/* track where the highest vpid landed - this is our
* new bookmark
*/
jdata->bookmark = node;
/* move to next object */
break;
}
i++;
}
}
/* cleanup */
@ -475,7 +494,7 @@ static int rank_by(orte_job_t *jdata,
/* Are all the procs ranked? we don't want to crash on INVALID ranks */
if (cnt < app->num_procs) {
return ORTE_ERR_NOT_SUPPORTED;
return ORTE_ERR_FAILED_TO_MAP;
}
}
return ORTE_SUCCESS;