1
1

Fix the nidmap computation to deal with heterogeneous nodes

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Ralph Castain 2017-05-09 16:06:15 -07:00
родитель 026f3dd2dd
Коммит 442e307a6e
3 изменённых файла: 89 добавлений и 67 удалений

Просмотреть файл

@@ -12,7 +12,7 @@
* Copyright (c) 2006-2013 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. * Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved. * All rights reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved. * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@@ -108,6 +108,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:multi-apps-and-zero-np", orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:multi-apps-and-zero-np",
true, jdata->num_apps, NULL); true, jdata->num_apps, NULL);
rc = ORTE_ERR_SILENT; rc = ORTE_ERR_SILENT;
opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__);
goto error; goto error;
} }
@@ -118,6 +119,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app, if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
jdata->map->mapping, initial_map, false))) { jdata->map->mapping, initial_map, false))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__);
goto error; goto error;
} }
/* flag that all subsequent requests should not reset the node->mapped flag */ /* flag that all subsequent requests should not reset the node->mapped flag */
@@ -236,10 +238,12 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
true, "mapping", true, "mapping",
orte_rmaps_base_print_mapping(jdata->map->mapping)); orte_rmaps_base_print_mapping(jdata->map->mapping));
rc = ORTE_ERR_SILENT; rc = ORTE_ERR_SILENT;
opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__);
goto error; goto error;
} }
if (ORTE_SUCCESS != rc) { if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__);
goto error; goto error;
} }
@@ -249,6 +253,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
*/ */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) { if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__);
return rc; return rc;
} }
@@ -270,6 +275,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
return ORTE_SUCCESS; return ORTE_SUCCESS;
error: error:
opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__);
while(NULL != (item = opal_list_remove_first(&node_list))) { while(NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item); OBJ_RELEASE(item);
} }

Просмотреть файл

@@ -493,6 +493,7 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
true, app->num_procs, app->app, orte_process_info.nodename); true, app->num_procs, app->app, orte_process_info.nodename);
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__);
return ORTE_ERR_SILENT; return ORTE_ERR_SILENT;
} }
} }
@@ -510,6 +511,7 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
if (NULL == node->topology || NULL == node->topology->topo) { if (NULL == node->topology || NULL == node->topology->topo) {
orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing", orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing",
true, node->name); true, node->name);
opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__);
return ORTE_ERR_SILENT; return ORTE_ERR_SILENT;
} }
start = 0; start = 0;
@@ -548,6 +550,7 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
/* add this node to the map, if reqd */ /* add this node to the map, if reqd */
if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) { if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) { if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__);
ORTE_ERROR_LOG(idx); ORTE_ERROR_LOG(idx);
return idx; return idx;
} }
@@ -566,15 +569,18 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
/* get the hwloc object */ /* get the hwloc object */
if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, target, cache_level, (i+start) % nobjs, OPAL_HWLOC_AVAILABLE))) { if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, target, cache_level, (i+start) % nobjs, OPAL_HWLOC_AVAILABLE))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__);
return ORTE_ERR_NOT_FOUND; return ORTE_ERR_NOT_FOUND;
} }
if (orte_rmaps_base.cpus_per_rank > (int)opal_hwloc_base_get_npus(node->topology->topo, obj)) { if (orte_rmaps_base.cpus_per_rank > (int)opal_hwloc_base_get_npus(node->topology->topo, obj)) {
orte_show_help("help-orte-rmaps-base.txt", "mapping-too-low", true, orte_show_help("help-orte-rmaps-base.txt", "mapping-too-low", true,
orte_rmaps_base.cpus_per_rank, opal_hwloc_base_get_npus(node->topology->topo, obj), orte_rmaps_base.cpus_per_rank, opal_hwloc_base_get_npus(node->topology->topo, obj),
orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__);
return ORTE_ERR_SILENT; return ORTE_ERR_SILENT;
} }
if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) { if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) {
opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__);
return ORTE_ERR_OUT_OF_RESOURCE; return ORTE_ERR_OUT_OF_RESOURCE;
} }
nprocs_mapped++; nprocs_mapped++;
@@ -601,12 +607,14 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
true, app->num_procs, app->app, orte_process_info.nodename); true, app->num_procs, app->app, orte_process_info.nodename);
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__);
return ORTE_ERR_SILENT; return ORTE_ERR_SILENT;
} else if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { } else if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
/* if we were explicitly told not to oversubscribe, then don't */ /* if we were explicitly told not to oversubscribe, then don't */
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
true, app->num_procs, app->app, orte_process_info.nodename); true, app->num_procs, app->app, orte_process_info.nodename);
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__);
return ORTE_ERR_SILENT; return ORTE_ERR_SILENT;
} }
} }
@@ -621,6 +629,7 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
if (nprocs_mapped < app->num_procs) { if (nprocs_mapped < app->num_procs) {
/* usually means there were no objects of the requested type */ /* usually means there were no objects of the requested type */
opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__);
return ORTE_ERR_NOT_FOUND; return ORTE_ERR_NOT_FOUND;
} }

Просмотреть файл

@@ -494,34 +494,50 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer)
return rc; return rc;
} }
for (n=0; n < orte_node_pool->size; n++) { /* there is always one topology - our own - so start with it */
nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
tp = OBJ_NEW(orte_regex_range_t);
tp->t = nptr->topology;
tp->cnt = 1;
opal_list_append(&topos, &tp->super);
/* likewise, we have slots */
slt = OBJ_NEW(orte_regex_range_t);
slt->slots = nptr->slots;
slt->cnt = 1;
opal_list_append(&slots, &slt->super);
/* and flags */
flg = OBJ_NEW(orte_regex_range_t);
if (ORTE_FLAG_TEST(nptr, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
flg->slots = 1;
} else {
flg->slots = 0;
}
flg->cnt = 1;
opal_list_append(&flags, &flg->super);
for (n=1; n < orte_node_pool->size; n++) {
if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n))) { if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n))) {
continue; continue;
} }
/* check the #slots */ /* check the #slots */
if (NULL == slt) { /* is this the next in line */
/* just starting */ if (nptr->slots == slt->slots) {
slt->cnt++;
} else {
/* need to start another range */
slt = OBJ_NEW(orte_regex_range_t); slt = OBJ_NEW(orte_regex_range_t);
slt->slots = nptr->slots; slt->slots = nptr->slots;
slt->cnt = 1; slt->cnt = 1;
opal_list_append(&slots, &slt->super); opal_list_append(&slots, &slt->super);
} else {
/* is this the next in line */
if (nptr->slots == slt->slots) {
slt->cnt++;
} else {
/* need to start another range */
slt = OBJ_NEW(orte_regex_range_t);
slt->slots = nptr->slots;
slt->cnt = 1;
opal_list_append(&slots, &slt->super);
}
} }
/* check the topologies */ /* check the topologies */
if (NULL == tp) { if (NULL == nptr->topology) {
/* just starting */ /* we don't know this topology, likely because
* we don't have a daemon on the node */
tp = OBJ_NEW(orte_regex_range_t); tp = OBJ_NEW(orte_regex_range_t);
tp->t = nptr->topology; tp->t = NULL;
tp->cnt = 1; tp->cnt = 1;
opal_list_append(&topos, &tp->super); opal_list_append(&topos, &tp->super);
} else { } else {
@@ -538,8 +554,12 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer)
} }
/* check the flags */ /* check the flags */
test = ORTE_FLAG_TEST(nptr, ORTE_NODE_FLAG_SLOTS_GIVEN); test = ORTE_FLAG_TEST(nptr, ORTE_NODE_FLAG_SLOTS_GIVEN);
if (NULL == flg) { /* is this the next in line */
/* just starting */ if ((test && 1 == flg->slots) ||
(!test && 0 == flg->slots)) {
flg->cnt++;
} else {
/* need to start another range */
flg = OBJ_NEW(orte_regex_range_t); flg = OBJ_NEW(orte_regex_range_t);
if (test) { if (test) {
flg->slots = 1; flg->slots = 1;
@@ -548,22 +568,6 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer)
} }
flg->cnt = 1; flg->cnt = 1;
opal_list_append(&flags, &flg->super); opal_list_append(&flags, &flg->super);
} else {
/* is this the next in line */
if ((test && 1 == flg->slots) ||
(!test && 0 == flg->slots)) {
flg->cnt++;
} else {
/* need to start another range */
flg = OBJ_NEW(orte_regex_range_t);
if (test) {
flg->slots = 1;
} else {
flg->slots = 0;
}
flg->cnt = 1;
opal_list_append(&flags, &flg->super);
}
} }
} }
@@ -581,7 +585,6 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer)
OBJ_RELEASE(rng); OBJ_RELEASE(rng);
} }
OPAL_LIST_DESTRUCT(&slots); OPAL_LIST_DESTRUCT(&slots);
/* pack the string */ /* pack the string */
if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) { if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
@@ -640,13 +643,6 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer)
OBJ_CONSTRUCT(&bucket, opal_buffer_t); OBJ_CONSTRUCT(&bucket, opal_buffer_t);
while (NULL != (item = opal_list_remove_first(&topos))) { while (NULL != (item = opal_list_remove_first(&topos))) {
rng = (orte_regex_range_t*)item; rng = (orte_regex_range_t*)item;
if (NULL == rng->t) {
/* when we pass thru here prior to launching the daemons, we
* won't have topologies for them and so this entry might
* be NULL - protect ourselves */
OBJ_RELEASE(item);
continue;
}
if (NULL == tmp) { if (NULL == tmp) {
asprintf(&tmp, "%d", rng->cnt); asprintf(&tmp, "%d", rng->cnt);
} else { } else {
@@ -654,28 +650,40 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer)
free(tmp); free(tmp);
tmp = tmp2; tmp = tmp2;
} }
/* pack this topology string */ if (NULL == rng->t) {
if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &rng->t->sig, 1, OPAL_STRING))) { /* need to account for NULL topology */
ORTE_ERROR_LOG(rc); tmp2 = NULL;
OBJ_RELEASE(rng); if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &tmp2, 1, OPAL_STRING))) {
OPAL_LIST_DESTRUCT(&topos); ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&bucket); OBJ_RELEASE(rng);
free(tmp); OPAL_LIST_DESTRUCT(&topos);
return rc; OBJ_DESTRUCT(&bucket);
} free(tmp);
/* pack the topology itself */ return rc;
if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &rng->t->topo, 1, OPAL_HWLOC_TOPO))) { }
ORTE_ERROR_LOG(rc); } else {
OBJ_RELEASE(rng); /* pack this topology string */
OPAL_LIST_DESTRUCT(&topos); if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &rng->t->sig, 1, OPAL_STRING))) {
OBJ_DESTRUCT(&bucket); ORTE_ERROR_LOG(rc);
free(tmp); OBJ_RELEASE(rng);
return rc; OPAL_LIST_DESTRUCT(&topos);
OBJ_DESTRUCT(&bucket);
free(tmp);
return rc;
}
/* pack the topology itself */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &rng->t->topo, 1, OPAL_HWLOC_TOPO))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(rng);
OPAL_LIST_DESTRUCT(&topos);
OBJ_DESTRUCT(&bucket);
free(tmp);
return rc;
}
} }
OBJ_RELEASE(rng); OBJ_RELEASE(rng);
} }
OPAL_LIST_DESTRUCT(&topos); OPAL_LIST_DESTRUCT(&topos);
/* pack the string */ /* pack the string */
if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) { if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
@@ -1029,11 +1037,10 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
goto cleanup; goto cleanup;
} }
if (NULL == sig) { if (NULL == sig) {
rc = ORTE_ERR_BAD_PARAM; /* the nodes in this range have not reported a topology,
ORTE_ERROR_LOG(rc); * so skip them */
opal_argv_free(tmp); offset += cnt;
OBJ_RELEASE(bptr); continue;
goto cleanup;
} }
n = 1; n = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(bptr, &topo, &n, OPAL_HWLOC_TOPO))) { if (ORTE_SUCCESS != (rc = opal_dss.unpack(bptr, &topo, &n, OPAL_HWLOC_TOPO))) {