1
1

Ensure pruning remains within the current job/app being mapped.

This commit was SVN r25397.
Этот коммит содержится в:
Ralph Castain 2011-10-30 00:02:20 +00:00
родитель 648c85b41b
Коммит 4232115a98

Просмотреть файл

@@ -35,8 +35,11 @@ orte_rmaps_base_module_t orte_rmaps_ppr_module = {
ppr ppr
}; };
static orte_proc_t* setup_proc(orte_job_t *jdata, orte_node_t *node); static orte_proc_t* setup_proc(orte_job_t *jdata, orte_node_t *node,
static void prune(orte_node_t *node, orte_app_idx_t idx);
static void prune(orte_jobid_t jobid,
orte_app_idx_t app_idx,
orte_node_t *node,
opal_hwloc_level_t *level, opal_hwloc_level_t *level,
orte_vpid_t *nmapped); orte_vpid_t *nmapped);
@@ -56,7 +59,7 @@ static int ppr(orte_job_t *jdata)
opal_list_item_t *item; opal_list_item_t *item;
orte_std_cntr_t num_slots; orte_std_cntr_t num_slots;
unsigned int nobjs, i; unsigned int nobjs, i;
orte_std_cntr_t idx; orte_app_idx_t idx;
/* only handle initial launch of loadbalanced /* only handle initial launch of loadbalanced
* or NPERxxx jobs - allow restarting of failed apps * or NPERxxx jobs - allow restarting of failed apps
@@ -100,7 +103,7 @@ static int ppr(orte_job_t *jdata)
cache_level = 1; cache_level = 1;
} }
for (idx=0; idx < jdata->apps->size; idx++) { for (idx=0; idx < (orte_app_idx_t)jdata->apps->size; idx++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx))) { if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx))) {
continue; continue;
} }
@@ -138,7 +141,7 @@ static int ppr(orte_job_t *jdata)
*/ */
if (HWLOC_OBJ_MACHINE == lowest) { if (HWLOC_OBJ_MACHINE == lowest) {
for (j=0; j < local_limit && nprocs_mapped < total_procs; j++) { for (j=0; j < local_limit && nprocs_mapped < total_procs; j++) {
if (NULL == (proc = setup_proc(jdata, node))) { if (NULL == (proc = setup_proc(jdata, node, idx))) {
rc = ORTE_ERR_OUT_OF_RESOURCE; rc = ORTE_ERR_OUT_OF_RESOURCE;
goto error; goto error;
} }
@@ -159,7 +162,7 @@ static int ppr(orte_job_t *jdata)
lowest, cache_level, lowest, cache_level,
i, OPAL_HWLOC_AVAILABLE); i, OPAL_HWLOC_AVAILABLE);
for (j=0; j < local_limit && nprocs_mapped < total_procs; j++) { for (j=0; j < local_limit && nprocs_mapped < total_procs; j++) {
if (NULL == (proc = setup_proc(jdata, node))) { if (NULL == (proc = setup_proc(jdata, node, idx))) {
rc = ORTE_ERR_OUT_OF_RESOURCE; rc = ORTE_ERR_OUT_OF_RESOURCE;
goto error; goto error;
} }
@@ -174,7 +177,7 @@ static int ppr(orte_job_t *jdata)
* node as we go * node as we go
*/ */
level--; level--;
prune(node, &level, &nprocs_mapped); prune(jdata->jobid, idx, node, &level, &nprocs_mapped);
} }
} }
@@ -260,7 +263,9 @@ static hwloc_obj_t find_split(hwloc_topology_t topo, hwloc_obj_t obj)
/* recursively climb the topology, pruning procs beyond that allowed /* recursively climb the topology, pruning procs beyond that allowed
* by the given ppr * by the given ppr
*/ */
static void prune(orte_node_t *node, static void prune(orte_jobid_t jobid,
orte_app_idx_t app_idx,
orte_node_t *node,
opal_hwloc_level_t *level, opal_hwloc_level_t *level,
orte_vpid_t *nmapped) orte_vpid_t *nmapped)
{ {
@@ -293,7 +298,7 @@ static void prune(orte_node_t *node,
return; return;
} }
*level -= 1; *level -= 1;
prune(node, level, nmapped); prune(jobid, app_idx, node, level, nmapped);
return; return;
} }
@@ -322,7 +327,7 @@ static void prune(orte_node_t *node,
avail = opal_hwloc_base_get_available_cpus(node->topology, obj); avail = opal_hwloc_base_get_available_cpus(node->topology, obj);
/* look at the intersection of this object's cpuset and that /* look at the intersection of this object's cpuset and that
* of each proc - if they intersect, then count this proc * of each proc in the job/app - if they intersect, then count this proc
* against the limit * against the limit
*/ */
nprocs = 0; nprocs = 0;
@@ -330,6 +335,10 @@ static void prune(orte_node_t *node,
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) {
continue; continue;
} }
if (proc->name.jobid != jobid ||
proc->app_idx != app_idx) {
continue;
}
cpus = opal_hwloc_base_get_available_cpus(node->topology, proc->locale); cpus = opal_hwloc_base_get_available_cpus(node->topology, proc->locale);
if (hwloc_bitmap_intersects(avail, cpus)) { if (hwloc_bitmap_intersects(avail, cpus)) {
nprocs++; nprocs++;
@@ -375,6 +384,10 @@ static void prune(orte_node_t *node,
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) {
continue; continue;
} }
if (proc->name.jobid != jobid ||
proc->app_idx != app_idx) {
continue;
}
cpus = opal_hwloc_base_get_available_cpus(node->topology, proc->locale); cpus = opal_hwloc_base_get_available_cpus(node->topology, proc->locale);
if (hwloc_bitmap_intersects(childcpus, cpus)) { if (hwloc_bitmap_intersects(childcpus, cpus)) {
nunder++; nunder++;
@@ -414,14 +427,16 @@ static void prune(orte_node_t *node,
return; return;
} }
*level -= 1; *level -= 1;
prune(node, level, nmapped); prune(jobid, app_idx, node, level, nmapped);
return; return;
error: error:
opal_output(0, "INFINITE LOOP"); opal_output(0, "INFINITE LOOP");
} }
static orte_proc_t* setup_proc(orte_job_t *jdata, orte_node_t *node) static orte_proc_t* setup_proc(orte_job_t *jdata,
orte_node_t *node,
orte_app_idx_t idx)
{ {
orte_proc_t *proc; orte_proc_t *proc;
int rc; int rc;
@@ -435,7 +450,7 @@ static orte_proc_t* setup_proc(orte_job_t *jdata, orte_node_t *node)
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
/* flag the proc as ready for launch */ /* flag the proc as ready for launch */
proc->state = ORTE_PROC_STATE_INIT; proc->state = ORTE_PROC_STATE_INIT;
proc->app_idx = 0; proc->app_idx = idx;
OBJ_RETAIN(node); /* maintain accounting on object */ OBJ_RETAIN(node); /* maintain accounting on object */
proc->node = node; proc->node = node;