diff --git a/orte/mca/rmaps/round_robin/rmaps_rr.c b/orte/mca/rmaps/round_robin/rmaps_rr.c index 863e959e33..06b621383c 100644 --- a/orte/mca/rmaps/round_robin/rmaps_rr.c +++ b/orte/mca/rmaps/round_robin/rmaps_rr.c @@ -12,7 +12,7 @@ * Copyright (c) 2006-2013 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011-2012 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -108,6 +108,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:multi-apps-and-zero-np", true, jdata->num_apps, NULL); rc = ORTE_ERR_SILENT; + opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__); goto error; } @@ -118,6 +119,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app, jdata->map->mapping, initial_map, false))) { ORTE_ERROR_LOG(rc); + opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__); goto error; } /* flag that all subsequent requests should not reset the node->mapped flag */ @@ -236,10 +238,12 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) true, "mapping", orte_rmaps_base_print_mapping(jdata->map->mapping)); rc = ORTE_ERR_SILENT; + opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__); goto error; } if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); + opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__); goto error; } @@ -249,6 +253,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) { ORTE_ERROR_LOG(rc); + opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__); return rc; } @@ -270,6 +275,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) return ORTE_SUCCESS; error: + opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__); while(NULL != (item = opal_list_remove_first(&node_list))) { OBJ_RELEASE(item); } diff --git a/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c b/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c index c0b08e2a03..8c2c9925e4 100644 --- a/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c +++ b/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c @@ -493,6 +493,7 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata, orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, app->num_procs, app->app, orte_process_info.nodename); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_SILENT; } } @@ -510,6 +511,7 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata, if (NULL == node->topology || NULL == node->topology->topo) { orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing", true, node->name); + opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_SILENT; } start = 0; @@ -548,6 +550,7 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata, /* add this node to the map, if reqd */ if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) { if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) { + opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); ORTE_ERROR_LOG(idx); return idx; } @@ -566,15 +569,18 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata, /* get the hwloc object */ if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, target, cache_level, (i+start) % nobjs, OPAL_HWLOC_AVAILABLE))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_NOT_FOUND; } if (orte_rmaps_base.cpus_per_rank > (int)opal_hwloc_base_get_npus(node->topology->topo, obj)) { orte_show_help("help-orte-rmaps-base.txt", "mapping-too-low", true, orte_rmaps_base.cpus_per_rank, opal_hwloc_base_get_npus(node->topology->topo, obj), orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); + opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_SILENT; } if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) { + opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_OUT_OF_RESOURCE; } nprocs_mapped++; @@ -601,12 +607,14 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata, orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, app->num_procs, app->app, orte_process_info.nodename); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_SILENT; } else if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { /* if we were explicitly told not to oversubscribe, then don't */ orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, app->num_procs, app->app, orte_process_info.nodename); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_SILENT; } } @@ -621,6 +629,7 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata, if (nprocs_mapped < app->num_procs) { /* usually means there were no objects of the requested type */ + opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_NOT_FOUND; } diff --git a/orte/util/nidmap.c b/orte/util/nidmap.c index 11bd366d34..ef7509e2a8 100644 --- a/orte/util/nidmap.c +++ b/orte/util/nidmap.c @@ -494,34 +494,50 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) return rc; } - for (n=0; n < orte_node_pool->size; n++) { + /* there is always one topology - our own - so start with it */ + nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); + tp = OBJ_NEW(orte_regex_range_t); + tp->t = nptr->topology; + tp->cnt = 1; + opal_list_append(&topos, &tp->super); + + /* likewise, we have slots */ + slt = OBJ_NEW(orte_regex_range_t); + slt->slots = nptr->slots; + slt->cnt = 1; + opal_list_append(&slots, &slt->super); + + /* and flags */ + flg = OBJ_NEW(orte_regex_range_t); + if (ORTE_FLAG_TEST(nptr, ORTE_NODE_FLAG_SLOTS_GIVEN)) { + flg->slots = 1; + } else { + flg->slots = 0; + } + flg->cnt = 1; + opal_list_append(&flags, &flg->super); + + for (n=1; n < orte_node_pool->size; n++) { if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n))) { continue; } /* check the #slots */ - if (NULL == slt) { - /* just starting */ + /* is this the next in line */ + if (nptr->slots == slt->slots) { + slt->cnt++; + } else { + /* need to start another range */ slt = OBJ_NEW(orte_regex_range_t); slt->slots = nptr->slots; slt->cnt = 1; opal_list_append(&slots, &slt->super); - } else { - /* is this the next in line */ - if (nptr->slots == slt->slots) { - slt->cnt++; - } else { - /* need to start another range */ - slt = OBJ_NEW(orte_regex_range_t); - slt->slots = nptr->slots; - slt->cnt = 1; - opal_list_append(&slots, &slt->super); - } } /* check the topologies */ - if (NULL == tp) { - /* just starting */ + if (NULL == nptr->topology) { + /* we don't know this topology, likely because + * we don't have a daemon on the node */ tp = OBJ_NEW(orte_regex_range_t); - tp->t = nptr->topology; + tp->t = NULL; tp->cnt = 1; opal_list_append(&topos, &tp->super); } else { @@ -538,8 +554,12 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) } /* check the flags */ test = ORTE_FLAG_TEST(nptr, ORTE_NODE_FLAG_SLOTS_GIVEN); - if (NULL == flg) { - /* just starting */ + /* is this the next in line */ + if ((test && 1 == flg->slots) || + (!test && 0 == flg->slots)) { + flg->cnt++; + } else { + /* need to start another range */ flg = OBJ_NEW(orte_regex_range_t); if (test) { flg->slots = 1; @@ -548,22 +568,6 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) } flg->cnt = 1; opal_list_append(&flags, &flg->super); - } else { - /* is this the next in line */ - if ((test && 1 == flg->slots) || - (!test && 0 == flg->slots)) { - flg->cnt++; - } else { - /* need to start another range */ - flg = OBJ_NEW(orte_regex_range_t); - if (test) { - flg->slots = 1; - } else { - flg->slots = 0; - } - flg->cnt = 1; - opal_list_append(&flags, &flg->super); - } } } @@ -581,7 +585,6 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) OBJ_RELEASE(rng); } OPAL_LIST_DESTRUCT(&slots); - /* pack the string */ if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) { ORTE_ERROR_LOG(rc); @@ -640,13 +643,6 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) OBJ_CONSTRUCT(&bucket, opal_buffer_t); while (NULL != (item = opal_list_remove_first(&topos))) { rng = (orte_regex_range_t*)item; - if (NULL == rng->t) { - /* when we pass thru here prior to launching the daemons, we - * won't have topologies for them and so this entry might - * be NULL - protect ourselves */ - OBJ_RELEASE(item); - continue; - } if (NULL == tmp) { asprintf(&tmp, "%d", rng->cnt); } else { @@ -654,28 +650,40 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) free(tmp); tmp = tmp2; } - /* pack this topology string */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &rng->t->sig, 1, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(rng); - OPAL_LIST_DESTRUCT(&topos); - OBJ_DESTRUCT(&bucket); - free(tmp); - return rc; - } - /* pack the topology itself */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &rng->t->topo, 1, OPAL_HWLOC_TOPO))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(rng); - OPAL_LIST_DESTRUCT(&topos); - OBJ_DESTRUCT(&bucket); - free(tmp); - return rc; + if (NULL == rng->t) { + /* need to account for NULL topology */ + tmp2 = NULL; + if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &tmp2, 1, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(rng); + OPAL_LIST_DESTRUCT(&topos); + OBJ_DESTRUCT(&bucket); + free(tmp); + return rc; + } + } else { + /* pack this topology string */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &rng->t->sig, 1, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(rng); + OPAL_LIST_DESTRUCT(&topos); + OBJ_DESTRUCT(&bucket); + free(tmp); + return rc; + } + /* pack the topology itself */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &rng->t->topo, 1, OPAL_HWLOC_TOPO))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(rng); + OPAL_LIST_DESTRUCT(&topos); + OBJ_DESTRUCT(&bucket); + free(tmp); + return rc; + } } OBJ_RELEASE(rng); } OPAL_LIST_DESTRUCT(&topos); - /* pack the string */ if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) { ORTE_ERROR_LOG(rc); @@ -1029,11 +1037,10 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer) goto cleanup; } if (NULL == sig) { - rc = ORTE_ERR_BAD_PARAM; - ORTE_ERROR_LOG(rc); - opal_argv_free(tmp); - OBJ_RELEASE(bptr); - goto cleanup; + /* the nodes in this range have not reported a topology, + * so skip them */ + offset += cnt; + continue; } n = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(bptr, &topo, &n, OPAL_HWLOC_TOPO))) {