1
1

Clean up a bug found by Josh that caused multiple app_contexts to keep mapping onto the first node in an allocation

Continue work on load balancing

Clean up code organization in rmaps_base

This commit was SVN r18353.
Этот коммит содержится в:
Ralph Castain 2008-05-01 21:07:49 +00:00
родитель 843a35094f
Коммит 432d441b3e
3 изменённых файлов: 83 добавлений и 37 удалений

Просмотреть файл

@ -53,6 +53,8 @@ int orte_rmaps_base_map_job(orte_job_t *jdata)
* PLM PROXY WILL SEND A JOB-OBJECT THAT WILL INCLUDE ANY
* MAPPING DIRECTIVES - OTHERWISE, THAT OBJECT WILL HAVE A
* NULL MAP FIELD
* LONE EXCEPTION - WE COPY DISPLAY MAP ACROSS IF THEY
* DIDN'T SET IT
*/
if (NULL == jdata->map) {
@ -73,6 +75,10 @@ int orte_rmaps_base_map_job(orte_job_t *jdata)
map->display_map = orte_rmaps_base.display_map;
/* assign the map object to this job */
jdata->map = map;
} else {
if (!map->display_map) {
map->display_map = orte_rmaps_base.display_map;
}
}
/* go ahead and map the job */

Просмотреть файл

@ -88,15 +88,15 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
ORTE_ERROR_LOG(rc);
return rc;
}
/** check that anything is here */
if (0 == opal_list_get_size(allocated_nodes)) {
opal_show_help("help-orte-rmaps-base.txt",
"orte-rmaps-base:no-available-resources",
true);
return ORTE_ERR_SILENT;
}
}
/** check that anything is here */
if (0 == opal_list_get_size(allocated_nodes)) {
opal_show_help("help-orte-rmaps-base.txt",
"orte-rmaps-base:no-available-resources",
true);
return ORTE_ERR_SILENT;
}
/* did the app_context contain a hostfile? */
if (NULL != app->hostfile) {
@ -108,31 +108,33 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
ORTE_ERROR_LOG(rc);
return rc;
}
/** check that anything is here */
if (0 == opal_list_get_size(allocated_nodes)) {
opal_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
true, app->app, app->hostfile);
return ORTE_ERR_SILENT;
}
}
/** check that anything is here */
if (0 == opal_list_get_size(allocated_nodes)) {
opal_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
true, app->app, app->hostfile);
return ORTE_ERR_SILENT;
}
/* now filter the list through any -host specification */
if (ORTE_SUCCESS != (rc = orte_util_filter_dash_host_nodes(allocated_nodes,
app->dash_host))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/** check that anything is left! */
if (0 == opal_list_get_size(allocated_nodes)) {
opal_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
true, app->app, "");
return ORTE_ERR_SILENT;
if (NULL != app->dash_host) {
if (ORTE_SUCCESS != (rc = orte_util_filter_dash_host_nodes(allocated_nodes,
app->dash_host))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/** check that anything is left! */
if (0 == opal_list_get_size(allocated_nodes)) {
opal_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
true, app->app, "");
return ORTE_ERR_SILENT;
}
}
/* If the "no local" option was set, then remove the local node
from the list */
* from the list
*/
if (policy & ORTE_RMAPS_NO_USE_LOCAL) {
for (item = opal_list_get_first(allocated_nodes);
item != opal_list_get_end(allocated_nodes);
@ -148,14 +150,14 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
break;
}
}
/** check that anything is left! */
if (0 == opal_list_get_size(allocated_nodes)) {
opal_show_help("help-orte-rmaps-base.txt",
"orte-rmaps-base:nolocal-no-available-resources", true);
return ORTE_ERR_SILENT;
}
}
/** check that anything is left! */
if (0 == opal_list_get_size(allocated_nodes)) {
opal_show_help("help-orte-rmaps-base.txt",
"orte-rmaps-base:nolocal-no-available-resources", true);
return ORTE_ERR_SILENT;
}
/* remove all nodes that are already at max usage, and
* compute the total number of available slots while

Просмотреть файл

@ -238,7 +238,7 @@ static int map_app_by_slot(
* OR we are at our ppn and loadbalancing, then break from the loop
*/
if (ORTE_ERR_NODE_FULLY_USED == rc ||
(orte_rmaps_base.loadbalance && i == ppn)) {
(orte_rmaps_base.loadbalance && (int)node->num_procs >= ppn)) {
break;
}
}
@ -250,7 +250,7 @@ static int map_app_by_slot(
*/
if (i < (num_slots_to_take-1) &&
ORTE_ERR_NODE_FULLY_USED != rc &&
(orte_rmaps_base.loadbalance && i != ppn)) {
(orte_rmaps_base.loadbalance && (int)node->num_procs < ppn)) {
continue;
}
cur_node_item = next;
@ -270,7 +270,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
orte_std_cntr_t i;
opal_list_t node_list;
opal_list_item_t *item;
orte_node_t *node, **nodes;
orte_node_t *node, **nodes, *nd1;
orte_vpid_t vpid_start;
orte_std_cntr_t num_nodes, num_slots;
int rc;
@ -290,11 +290,12 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
* if we are doing pernode or if #procs was not given
*/
if (orte_rmaps_base.loadbalance && !map->pernode) {
/* compute total #procs */
float res;
/* compute total #procs we are going to add */
for(i=0; i < jdata->num_apps; i++) {
app = apps[i];
if (0 == app->num_procs) {
/* can't do it - just move on */
/* can't do it - tell user and quit */
opal_show_help("help-orte-rmaps-rr.txt",
"orte-rmaps-rr:loadbalance-and-zero-np",
true);
@ -303,7 +304,9 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
}
ppn += app->num_procs;
}
/* get the total avail nodes */
/* get the total avail nodes and the number
* of procs already using them
*/
nodes = (orte_node_t**)orte_node_pool->addr;
num_nodes=0;
for (i=0; i < orte_node_pool->size; i++) {
@ -315,7 +318,11 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
}
}
/* compute the balance */
res = ((float)ppn / num_nodes);
ppn = ppn / num_nodes;
if (0 < (res-ppn)) {
ppn++;
}
}
/* cycle through the app_contexts, mapping them sequentially */
@ -368,6 +375,37 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
cur_node_item = opal_list_get_first(&node_list);
}
/* is this node oversubscribed? */
node = (orte_node_t*)cur_node_item;
if (node->slots_inuse > node->slots) {
/* work down the list - is there another node that
* would not be oversubscribed?
*/
if (cur_node_item != opal_list_get_end(&node_list)) {
item = opal_list_get_next(cur_node_item);
} else {
item = opal_list_get_first(&node_list);
}
while (item != cur_node_item) {
nd1 = (orte_node_t*)item;
if (nd1->slots_inuse < nd1->slots) {
/* this node is not oversubscribed! use it! */
cur_node_item = item;
goto proceed;
}
if (item == opal_list_get_end(&node_list)) {
item = opal_list_get_first(&node_list);
} else {
item= opal_list_get_next(item);
}
}
/* if we get here, then we cycled all the way around the
* list without finding a better answer - just use what
* we have
*/
}
proceed:
if (map->pernode && map->npernode == 1) {
/* there are three use-cases that we need to deal with:
* (a) if -np was not provided, then we just use the number of nodes