1
1

Fix a bug found by Josh that caused multiple app_contexts to keep mapping onto the first node in an allocation

Continue work on load balancing

Clean up code organization in rmaps_base

This commit was SVN r18353.
Этот коммит содержится в:
Ralph Castain 2008-05-01 21:07:49 +00:00
родитель 843a35094f
Коммит 432d441b3e
3 изменённых файлов: 83 добавлений и 37 удалений

Просмотреть файл

@ -53,6 +53,8 @@ int orte_rmaps_base_map_job(orte_job_t *jdata)
* PLM PROXY WILL SEND A JOB-OBJECT THAT WILL INCLUDE ANY * PLM PROXY WILL SEND A JOB-OBJECT THAT WILL INCLUDE ANY
* MAPPING DIRECTIVES - OTHERWISE, THAT OBJECT WILL HAVE A * MAPPING DIRECTIVES - OTHERWISE, THAT OBJECT WILL HAVE A
* NULL MAP FIELD * NULL MAP FIELD
* LONE EXCEPTION - WE COPY DISPLAY MAP ACROSS IF THEY
* DIDN'T SET IT
*/ */
if (NULL == jdata->map) { if (NULL == jdata->map) {
@ -73,6 +75,10 @@ int orte_rmaps_base_map_job(orte_job_t *jdata)
map->display_map = orte_rmaps_base.display_map; map->display_map = orte_rmaps_base.display_map;
/* assign the map object to this job */ /* assign the map object to this job */
jdata->map = map; jdata->map = map;
} else {
if (!map->display_map) {
map->display_map = orte_rmaps_base.display_map;
}
} }
/* go ahead and map the job */ /* go ahead and map the job */

Просмотреть файл

@ -88,15 +88,15 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return rc; return rc;
} }
/** check that anything is here */
if (0 == opal_list_get_size(allocated_nodes)) {
opal_show_help("help-orte-rmaps-base.txt",
"orte-rmaps-base:no-available-resources",
true);
return ORTE_ERR_SILENT;
}
} }
/** check that anything is here */
if (0 == opal_list_get_size(allocated_nodes)) {
opal_show_help("help-orte-rmaps-base.txt",
"orte-rmaps-base:no-available-resources",
true);
return ORTE_ERR_SILENT;
}
/* did the app_context contain a hostfile? */ /* did the app_context contain a hostfile? */
if (NULL != app->hostfile) { if (NULL != app->hostfile) {
@ -108,31 +108,33 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return rc; return rc;
} }
/** check that anything is here */
if (0 == opal_list_get_size(allocated_nodes)) {
opal_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
true, app->app, app->hostfile);
return ORTE_ERR_SILENT;
}
} }
/** check that anything is here */
if (0 == opal_list_get_size(allocated_nodes)) {
opal_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
true, app->app, app->hostfile);
return ORTE_ERR_SILENT;
}
/* now filter the list through any -host specification */ /* now filter the list through any -host specification */
if (ORTE_SUCCESS != (rc = orte_util_filter_dash_host_nodes(allocated_nodes, if (NULL != app->dash_host) {
app->dash_host))) { if (ORTE_SUCCESS != (rc = orte_util_filter_dash_host_nodes(allocated_nodes,
ORTE_ERROR_LOG(rc); app->dash_host))) {
return rc; ORTE_ERROR_LOG(rc);
} return rc;
}
/** check that anything is left! */ /** check that anything is left! */
if (0 == opal_list_get_size(allocated_nodes)) { if (0 == opal_list_get_size(allocated_nodes)) {
opal_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node", opal_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
true, app->app, ""); true, app->app, "");
return ORTE_ERR_SILENT; return ORTE_ERR_SILENT;
}
} }
/* If the "no local" option was set, then remove the local node /* If the "no local" option was set, then remove the local node
from the list */ * from the list
*/
if (policy & ORTE_RMAPS_NO_USE_LOCAL) { if (policy & ORTE_RMAPS_NO_USE_LOCAL) {
for (item = opal_list_get_first(allocated_nodes); for (item = opal_list_get_first(allocated_nodes);
item != opal_list_get_end(allocated_nodes); item != opal_list_get_end(allocated_nodes);
@ -148,14 +150,14 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
break; break;
} }
} }
/** check that anything is left! */
if (0 == opal_list_get_size(allocated_nodes)) {
opal_show_help("help-orte-rmaps-base.txt",
"orte-rmaps-base:nolocal-no-available-resources", true);
return ORTE_ERR_SILENT;
}
} }
/** check that anything is left! */
if (0 == opal_list_get_size(allocated_nodes)) {
opal_show_help("help-orte-rmaps-base.txt",
"orte-rmaps-base:nolocal-no-available-resources", true);
return ORTE_ERR_SILENT;
}
/* remove all nodes that are already at max usage, and /* remove all nodes that are already at max usage, and
* compute the total number of available slots while * compute the total number of available slots while

Просмотреть файл

@ -238,7 +238,7 @@ static int map_app_by_slot(
* OR we are at our ppn and loadbalancing, then break from the loop * OR we are at our ppn and loadbalancing, then break from the loop
*/ */
if (ORTE_ERR_NODE_FULLY_USED == rc || if (ORTE_ERR_NODE_FULLY_USED == rc ||
(orte_rmaps_base.loadbalance && i == ppn)) { (orte_rmaps_base.loadbalance && (int)node->num_procs >= ppn)) {
break; break;
} }
} }
@ -250,7 +250,7 @@ static int map_app_by_slot(
*/ */
if (i < (num_slots_to_take-1) && if (i < (num_slots_to_take-1) &&
ORTE_ERR_NODE_FULLY_USED != rc && ORTE_ERR_NODE_FULLY_USED != rc &&
(orte_rmaps_base.loadbalance && i != ppn)) { (orte_rmaps_base.loadbalance && (int)node->num_procs < ppn)) {
continue; continue;
} }
cur_node_item = next; cur_node_item = next;
@ -270,7 +270,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
orte_std_cntr_t i; orte_std_cntr_t i;
opal_list_t node_list; opal_list_t node_list;
opal_list_item_t *item; opal_list_item_t *item;
orte_node_t *node, **nodes; orte_node_t *node, **nodes, *nd1;
orte_vpid_t vpid_start; orte_vpid_t vpid_start;
orte_std_cntr_t num_nodes, num_slots; orte_std_cntr_t num_nodes, num_slots;
int rc; int rc;
@ -290,11 +290,12 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
* if we are doing pernode or if #procs was not given * if we are doing pernode or if #procs was not given
*/ */
if (orte_rmaps_base.loadbalance && !map->pernode) { if (orte_rmaps_base.loadbalance && !map->pernode) {
/* compute total #procs */ float res;
/* compute total #procs we are going to add */
for(i=0; i < jdata->num_apps; i++) { for(i=0; i < jdata->num_apps; i++) {
app = apps[i]; app = apps[i];
if (0 == app->num_procs) { if (0 == app->num_procs) {
/* can't do it - just move on */ /* can't do it - tell user and quit */
opal_show_help("help-orte-rmaps-rr.txt", opal_show_help("help-orte-rmaps-rr.txt",
"orte-rmaps-rr:loadbalance-and-zero-np", "orte-rmaps-rr:loadbalance-and-zero-np",
true); true);
@ -303,7 +304,9 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
} }
ppn += app->num_procs; ppn += app->num_procs;
} }
/* get the total avail nodes */ /* get the total avail nodes and the number
* of procs already using them
*/
nodes = (orte_node_t**)orte_node_pool->addr; nodes = (orte_node_t**)orte_node_pool->addr;
num_nodes=0; num_nodes=0;
for (i=0; i < orte_node_pool->size; i++) { for (i=0; i < orte_node_pool->size; i++) {
@ -315,7 +318,11 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
} }
} }
/* compute the balance */ /* compute the balance */
res = ((float)ppn / num_nodes);
ppn = ppn / num_nodes; ppn = ppn / num_nodes;
if (0 < (res-ppn)) {
ppn++;
}
} }
/* cycle through the app_contexts, mapping them sequentially */ /* cycle through the app_contexts, mapping them sequentially */
@ -368,6 +375,37 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
cur_node_item = opal_list_get_first(&node_list); cur_node_item = opal_list_get_first(&node_list);
} }
/* is this node oversubscribed? */
node = (orte_node_t*)cur_node_item;
if (node->slots_inuse > node->slots) {
/* work down the list - is there another node that
* would not be oversubscribed?
*/
if (cur_node_item != opal_list_get_end(&node_list)) {
item = opal_list_get_next(cur_node_item);
} else {
item = opal_list_get_first(&node_list);
}
while (item != cur_node_item) {
nd1 = (orte_node_t*)item;
if (nd1->slots_inuse < nd1->slots) {
/* this node is not oversubscribed! use it! */
cur_node_item = item;
goto proceed;
}
if (item == opal_list_get_end(&node_list)) {
item = opal_list_get_first(&node_list);
} else {
item= opal_list_get_next(item);
}
}
/* if we get here, then we cycled all the way around the
* list without finding a better answer - just use what
* we have
*/
}
proceed:
if (map->pernode && map->npernode == 1) { if (map->pernode && map->npernode == 1) {
/* there are three use-cases that we need to deal with: /* there are three use-cases that we need to deal with:
* (a) if -np was not provided, then we just use the number of nodes * (a) if -np was not provided, then we just use the number of nodes