From 7cc5879bdd7534605e92f28f33405a7a8a3dc4b3 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 21 Dec 2015 09:47:27 -0800 Subject: [PATCH] Fix the default slot mapping in rank file mapper --- orte/mca/rmaps/rank_file/rmaps_rank_file.c | 88 +++++++++++++++------- 1 file changed, 59 insertions(+), 29 deletions(-) diff --git a/orte/mca/rmaps/rank_file/rmaps_rank_file.c b/orte/mca/rmaps/rank_file/rmaps_rank_file.c index 854451aea6..a43d8d7bed 100644 --- a/orte/mca/rmaps/rank_file/rmaps_rank_file.c +++ b/orte/mca/rmaps/rank_file/rmaps_rank_file.c @@ -196,10 +196,42 @@ static int orte_rmaps_rf_map(orte_job_t *jdata) rank = vpid_start + k; /* get the rankfile entry for this rank */ if (NULL == (rfmap = (orte_rmaps_rank_file_map_t*)opal_pointer_array_get_item(&rankmap, rank))) { - /* all ranks must be specified */ - orte_show_help("help-rmaps_rank_file.txt", "missing-rank", true, rank, orte_rankfile); - rc = ORTE_ERR_SILENT; - goto error; + /* if we were give a default slot-list, then use it */ + if (NULL != opal_hwloc_base_slot_list) { + slots = opal_hwloc_base_slot_list; + /* take the next node off of the available list */ + node = NULL; + OPAL_LIST_FOREACH(nd, &node_list, orte_node_t) { + /* if adding one to this node would oversubscribe it, then try + * the next one */ + if (nd->slots <= (int)nd->num_procs) { + continue; + } + /* take this one */ + node = nd; + break; + } + if (NULL == node) { + /* all would be oversubscribed, so take the least loaded one */ + k = UINT32_MAX; + OPAL_LIST_FOREACH(nd, &node_list, orte_node_t) { + if (nd->num_procs < k) { + k = nd->num_procs; + node = nd; + } + } + } + /* if we still have nothing, then something is very wrong */ + if (NULL == node) { + rc = ORTE_ERR_OUT_OF_RESOURCE; + goto error; + } + } else { + /* all ranks must be specified */ + orte_show_help("help-rmaps_rank_file.txt", "missing-rank", true, rank, orte_rankfile); + rc = ORTE_ERR_SILENT; + goto error; + } } else { if (0 == strlen(rfmap->slot_list)) { /* rank was specified but no slot list given - that's an error */ @@ -208,34 +240,32 @@ static int orte_rmaps_rf_map(orte_job_t *jdata) goto error; } slots = rfmap->slot_list; - } + /* find the node where this proc was assigned */ + node = NULL; + OPAL_LIST_FOREACH(nd, &node_list, orte_node_t) { + if (NULL != rfmap->node_name && + 0 == strcmp(nd->name, rfmap->node_name)) { + node = nd; + break; + } else if (NULL != rfmap->node_name && + (('+' == rfmap->node_name[0]) && + (('n' == rfmap->node_name[1]) || + ('N' == rfmap->node_name[1])))) { - /* find the node where this proc was assigned */ - node = NULL; - OPAL_LIST_FOREACH(nd, &node_list, orte_node_t) { - if (NULL != rfmap->node_name && - 0 == strcmp(nd->name, rfmap->node_name)) { - node = nd; - break; - } else if (NULL != rfmap->node_name && - (('+' == rfmap->node_name[0]) && - (('n' == rfmap->node_name[1]) || - ('N' == rfmap->node_name[1])))) { - - relative_index=atoi(strtok(rfmap->node_name,"+n")); - if ( relative_index >= (int)opal_list_get_size (&node_list) || ( 0 > relative_index)){ - orte_show_help("help-rmaps_rank_file.txt","bad-index", true,rfmap->node_name); - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; + relative_index=atoi(strtok(rfmap->node_name,"+n")); + if ( relative_index >= (int)opal_list_get_size (&node_list) || ( 0 > relative_index)){ + orte_show_help("help-rmaps_rank_file.txt","bad-index", true,rfmap->node_name); + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + return ORTE_ERR_BAD_PARAM; + } + root_node = (orte_node_t*) opal_list_get_first(&node_list); + for(tmp_cnt=0; tmp_cntnode_name);