diff --git a/orte/mca/ras/base/ras_base_node.c b/orte/mca/ras/base/ras_base_node.c index 7df04a9bb2..b031900164 100644 --- a/orte/mca/ras/base/ras_base_node.c +++ b/orte/mca/ras/base/ras_base_node.c @@ -23,6 +23,7 @@ #include "opal/util/output.h" #include "opal/util/argv.h" +#include "opal/util/if.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/util/name_fns.h" @@ -111,7 +112,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata) * first position since it is the first one entered. We need to check to see * if this node is the same as the HNP's node so we don't double-enter it */ - if (0 == strcmp(node->name, hnp_node->name)) { + if (0 == strcmp(node->name, hnp_node->name) || opal_ifislocal(node->name)) { OPAL_OUTPUT_VERBOSE((5, orte_ras_base.ras_output, "%s ras:base:node_insert updating HNP info to %ld slots", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -124,6 +125,9 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata) hnp_node->slots_alloc = node->slots_alloc; hnp_node->slots_max = node->slots_max; hnp_node->launch_id = node->launch_id; + /* use the RM's name for the node */ + free(hnp_node->name); + hnp_node->name = strdup(node->name); /* set the node to available for use */ hnp_node->allocate = true; /* update the total slots in the job */ diff --git a/orte/mca/rmaps/base/base.h b/orte/mca/rmaps/base/base.h index aeb421a553..9b149ef368 100644 --- a/orte/mca/rmaps/base/base.h +++ b/orte/mca/rmaps/base/base.h @@ -57,10 +57,12 @@ typedef struct { bool pernode; /** number of ppn for n_per_node mode */ int npernode; - /* do we not allow use of the localhost */ + /* do not allow use of the localhost */ bool no_use_local; /* display the map after it is computed */ bool display_map; + /* balance load across nodes */ + bool loadbalance; } orte_rmaps_base_t; /** diff --git a/orte/mca/rmaps/base/rmaps_base_open.c b/orte/mca/rmaps/base/rmaps_base_open.c index f6ff8e2cbc..b031c518bc 100644 --- a/orte/mca/rmaps/base/rmaps_base_open.c +++ b/orte/mca/rmaps/base/rmaps_base_open.c @@ -125,6 +125,16 @@ int orte_rmaps_base_open(void) orte_rmaps_base.oversubscribe = true; } + /* Do we want to loadbalance the job */ + param = mca_base_param_reg_int_name("rmaps", "base_loadbalance", + "Balance total number of procs across all allocated nodes", + false, false, (int)false, &value); + orte_rmaps_base.loadbalance = OPAL_INT_TO_BOOL(value); + /* if we are doing npernode or pernode, then we cannot loadbalance */ + if (orte_rmaps_base.pernode) { + orte_rmaps_base.loadbalance = false; + } + /* should we display the map after determining it? */ mca_base_param_reg_int_name("rmaps", "base_display_map", "Whether to display the process map after it is computed", diff --git a/orte/mca/rmaps/base/rmaps_base_support_fns.c b/orte/mca/rmaps/base/rmaps_base_support_fns.c index bb1dd0efb2..184f9024b0 100644 --- a/orte/mca/rmaps/base/rmaps_base_support_fns.c +++ b/orte/mca/rmaps/base/rmaps_base_support_fns.c @@ -88,18 +88,17 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr ORTE_ERROR_LOG(rc); return rc; } - } - - /** check that anything is here */ - if (0 == opal_list_get_size(allocated_nodes)) { - opal_show_help("help-orte-rmaps-base.txt", - "orte-rmaps-base:no-available-resources", - true); - return ORTE_ERR_SILENT; + /** check that anything is here */ + if (0 == opal_list_get_size(allocated_nodes)) { + opal_show_help("help-orte-rmaps-base.txt", + "orte-rmaps-base:no-available-resources", + true); + return ORTE_ERR_SILENT; + } } /* did the app_context contain a hostfile? */ - if (NULL != app->hostfile) { + if (NULL != app && NULL != app->hostfile) { /* yes - filter the node list through the file, removing * any nodes not found in the file */ @@ -108,27 +107,27 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr ORTE_ERROR_LOG(rc); return rc; } + /** check that anything is here */ + if (0 == opal_list_get_size(allocated_nodes)) { + opal_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node", + true, app->app, app->hostfile); + return ORTE_ERR_SILENT; + } } - /** check that anything is here */ - if (0 == opal_list_get_size(allocated_nodes)) { - opal_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node", - true, app->app, app->hostfile); - return ORTE_ERR_SILENT; - } - - /* now filter the list through any -host specification */ - if (ORTE_SUCCESS != (rc = orte_util_filter_dash_host_nodes(allocated_nodes, - app->dash_host))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /** check that anything is left! */ - if (0 == opal_list_get_size(allocated_nodes)) { - opal_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node", - true, app->app, ""); - return ORTE_ERR_SILENT; + /* now filter the list through any -host specification */ + if (NULL != app) { + if (ORTE_SUCCESS != (rc = orte_util_filter_dash_host_nodes(allocated_nodes, + app->dash_host))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /** check that anything is left! */ + if (0 == opal_list_get_size(allocated_nodes)) { + opal_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node", + true, app->app, ""); + return ORTE_ERR_SILENT; + } } /* If the "no local" option was set, then remove the local node diff --git a/orte/mca/rmaps/round_robin/rmaps_rr.c b/orte/mca/rmaps/round_robin/rmaps_rr.c index 61e5e16de5..d4252292db 100644 --- a/orte/mca/rmaps/round_robin/rmaps_rr.c +++ b/orte/mca/rmaps/round_robin/rmaps_rr.c @@ -46,6 +46,7 @@ * Local variable */ static opal_list_item_t *cur_node_item = NULL; +static int ppn = 0; /* * Create a default mapping for the application, scheduling round @@ -228,10 +229,12 @@ static int map_app_by_slot( /* Update the number of procs allocated */ ++num_alloc; - /** if all the procs have been mapped OR we have fully used up this node, then - * break from the loop + /** if all the procs have been mapped OR we have fully used up this node + * OR we are at our ppn and loadbalancing, then break from the loop */ - if(num_alloc == app->num_procs || ORTE_ERR_NODE_FULLY_USED == rc) { + if (num_alloc == app->num_procs || + ORTE_ERR_NODE_FULLY_USED == rc || + (orte_rmaps_base.loadbalance && i == ppn)) { break; } } @@ -241,7 +244,9 @@ static int map_app_by_slot( * node is NOT max'd out * */ - if (i < (num_slots_to_take-1) && ORTE_ERR_NODE_FULLY_USED != rc) { + if (i < (num_slots_to_take-1) && + ORTE_ERR_NODE_FULLY_USED != rc && + i != ppn) { continue; } cur_node_item = next; @@ -261,7 +266,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) orte_std_cntr_t i; opal_list_t node_list; opal_list_item_t *item; - orte_node_t *node; + orte_node_t *node, **nodes; orte_vpid_t vpid_start; orte_std_cntr_t num_nodes, num_slots; int rc; @@ -276,6 +281,39 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) /* start at the beginning... */ vpid_start = 0; + /* if loadbalancing is requested, then we need to compute + * the #procs/node - note that this cannot be done + * if we are doing pernode or if #procs was not given + */ + if (orte_rmaps_base.loadbalance && !map->pernode) { + /* compute total #procs */ + for(i=0; i < jdata->num_apps; i++) { + app = apps[i]; + if (0 == app->num_procs) { + /* can't do it - just move on */ + opal_show_help("help-orte-rmaps-rr.txt", + "orte-rmaps-rr:loadbalance-and-zero-np", + true); + rc = ORTE_ERR_SILENT; + goto error; + } + ppn += app->num_procs; + } + /* get the total avail nodes */ + nodes = (orte_node_t**)orte_node_pool->addr; + num_nodes=0; + for (i=0; i < orte_node_pool->size; i++) { + if (NULL == nodes[i]) { + break; /* nodes are left aligned, so stop when we hit a null */ + } + if (nodes[i]->allocate) { + num_nodes++; + } + } + /* compute the balance */ + ppn = ppn / num_nodes; + } + /* cycle through the app_contexts, mapping them sequentially */ for(i=0; i < jdata->num_apps; i++) { app = apps[i]; @@ -387,7 +425,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) goto error; } } - + /** track the total number of processes we mapped */ jdata->num_procs += app->num_procs; diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index b432f72f75..e9c9b6a593 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -215,6 +215,9 @@ static opal_cmd_line_init_t cmd_line_init[] = { { "rmaps", "base", "no_oversubscribe", '\0', "nooversubscribe", "nooversubscribe", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, "Nodes are not to be oversubscribed, even if the system supports such operation"}, + { "rmaps", "base", "loadbalance", '\0', "loadbalance", "loadbalance", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Balance total number of procs across all allocated nodes"}, { "rmaps", "base", "display_map", '\0', "display-map", "display-map", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, "Display the process map just before launch"},