
The rest of the multi app_context fix. Remove the restriction on the number of app_contexts that can have zero np specified, as multiple mappers now support that use-case. Update the ranking algorithms to respect and track bookmarks. Ensure we properly set the oversubscribed flag on a per-node basis.

This commit was SVN r25578.
This commit is contained in:
Ralph Castain 2011-12-06 17:28:29 +00:00
parent d9c7764e9b
commit 90b7f2a7bf
8 changed files with 309 additions and 219 deletions

View file

@ -50,16 +50,20 @@
#if OPAL_HAVE_HWLOC
static int rank_span(orte_job_t *jdata,
orte_app_context_t *app,
opal_list_t *nodes,
hwloc_obj_type_t target,
unsigned cache_level)
{
orte_job_map_t *map;
hwloc_obj_t obj;
int num_objs, i, j, n, rc;
int num_objs, i, j, rc;
orte_vpid_t num_ranked=0;
orte_node_t *node;
orte_proc_t *proc;
orte_vpid_t vpid;
int cnt;
opal_list_item_t *item;
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rank_span: for job %s",
@ -83,12 +87,13 @@ static int rank_span(orte_job_t *jdata,
*/
map = jdata->map;
vpid = 0;
while (vpid < jdata->num_procs) {
for (n=0; n < map->nodes->size && vpid < jdata->num_procs; n++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) {
continue;
}
vpid = jdata->num_procs;
cnt = 0;
while (cnt < app->num_procs) {
for (item = opal_list_get_first(nodes);
item != opal_list_get_end(nodes);
item = opal_list_get_next(item)) {
node = (orte_node_t*)item;
/* get the number of objects - only consider those we can actually use */
num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target,
cache_level, OPAL_HWLOC_AVAILABLE);
@ -97,7 +102,7 @@ static int rank_span(orte_job_t *jdata,
num_objs, node->name, (int)node->num_procs);
/* for each object */
for (i=0; i < num_objs && vpid < jdata->num_procs; i++) {
for (i=0; i < num_objs && cnt < app->num_procs; i++) {
obj = opal_hwloc_base_get_obj_by_type(node->topology, target,
cache_level, i, OPAL_HWLOC_AVAILABLE);
@ -105,7 +110,7 @@ static int rank_span(orte_job_t *jdata,
"mca:rmaps:rank_span: working object %d", i);
/* cycle thru the procs on this node */
for (j=0; j < node->procs->size && vpid < jdata->num_procs; j++) {
for (j=0; j < node->procs->size && cnt < app->num_procs; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
}
@ -120,6 +125,10 @@ static int rank_span(orte_job_t *jdata,
if (ORTE_VPID_INVALID != proc->name.vpid) {
continue;
}
/* ignore procs from other apps */
if (proc->app_idx != app->idx) {
continue;
}
/* protect against bozo case */
if (NULL == proc->locale) {
ORTE_ERROR_LOG(ORTE_ERROR);
@ -135,6 +144,7 @@ static int rank_span(orte_job_t *jdata,
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rank_span: assigning vpid %s", ORTE_VPID_PRINT(vpid));
proc->name.vpid = vpid++;
cnt++;
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
@ -146,6 +156,10 @@ static int rank_span(orte_job_t *jdata,
ORTE_ERROR_LOG(rc);
return rc;
}
/* track where the highest vpid landed - this is our
* new bookmark
*/
jdata->bookmark = node;
/* move to next object */
break;
}
@ -157,16 +171,20 @@ static int rank_span(orte_job_t *jdata,
}
static int rank_fill(orte_job_t *jdata,
orte_app_context_t *app,
opal_list_t *nodes,
hwloc_obj_type_t target,
unsigned cache_level)
{
orte_job_map_t *map;
hwloc_obj_t obj;
int num_objs, i, j, n, rc;
int num_objs, i, j, rc;
orte_vpid_t num_ranked=0;
orte_node_t *node;
orte_proc_t *proc;
orte_vpid_t vpid;
int cnt;
opal_list_item_t *item;
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rank_fill: for job %s",
@ -182,11 +200,12 @@ static int rank_fill(orte_job_t *jdata,
*/
map = jdata->map;
vpid = 0;
for (n=0; n < map->nodes->size && vpid < jdata->num_procs; n++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) {
continue;
}
vpid = jdata->num_procs;
cnt = 0;
for (item = opal_list_get_first(nodes);
item != opal_list_get_end(nodes);
item = opal_list_get_next(item)) {
node = (orte_node_t*)item;
/* get the number of objects - only consider those we can actually use */
num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target,
cache_level, OPAL_HWLOC_AVAILABLE);
@ -195,7 +214,7 @@ static int rank_fill(orte_job_t *jdata,
num_objs, node->name, (int)node->num_procs);
/* for each object */
for (i=0; i < num_objs && vpid < jdata->num_procs; i++) {
for (i=0; i < num_objs && cnt < app->num_procs; i++) {
obj = opal_hwloc_base_get_obj_by_type(node->topology, target,
cache_level, i, OPAL_HWLOC_AVAILABLE);
@ -203,7 +222,7 @@ static int rank_fill(orte_job_t *jdata,
"mca:rmaps:rank_fill: working object %d", i);
/* cycle thru the procs on this node */
for (j=0; j < node->procs->size && vpid < jdata->num_procs; j++) {
for (j=0; j < node->procs->size && cnt < app->num_procs; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
}
@ -217,6 +236,10 @@ static int rank_fill(orte_job_t *jdata,
/* ignore procs that are already assigned */
if (ORTE_VPID_INVALID != proc->name.vpid) {
continue;
}
/* ignore procs from other apps */
if (proc->app_idx != app->idx) {
continue;
}
/* protect against bozo case */
if (NULL == proc->locale) {
@ -233,6 +256,7 @@ static int rank_fill(orte_job_t *jdata,
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rank_fill: assigning vpid %s", ORTE_VPID_PRINT(vpid));
proc->name.vpid = vpid++;
cnt++;
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
@ -244,6 +268,10 @@ static int rank_fill(orte_job_t *jdata,
ORTE_ERROR_LOG(rc);
return rc;
}
/* track where the highest vpid landed - this is our
* new bookmark
*/
jdata->bookmark = node;
}
}
}
@ -252,23 +280,27 @@ static int rank_fill(orte_job_t *jdata,
}
static int rank_by(orte_job_t *jdata,
orte_app_context_t *app,
opal_list_t *nodes,
hwloc_obj_type_t target,
unsigned cache_level)
{
orte_job_map_t *map;
hwloc_obj_t obj;
int num_objs, i, j, n;
int num_objs, i, j;
orte_vpid_t num_ranked=0;
orte_node_t *node;
orte_proc_t *proc;
orte_vpid_t vpid;
int cnt;
opal_pointer_array_t objs;
bool all_done;
opal_list_item_t *item;
if (ORTE_RANKING_SPAN & ORTE_GET_RANKING_DIRECTIVE(jdata->map->ranking)) {
return rank_span(jdata, target, cache_level);
return rank_span(jdata, app, nodes, target, cache_level);
} else if (ORTE_RANKING_FILL & ORTE_GET_RANKING_DIRECTIVE(jdata->map->ranking)) {
return rank_fill(jdata, target, cache_level);
return rank_fill(jdata, app, nodes, target, cache_level);
}
/* if ranking is not spanned or filled, then we
@ -288,11 +320,12 @@ static int rank_by(orte_job_t *jdata,
opal_pointer_array_init(&objs, 2, INT_MAX, 2);
map = jdata->map;
vpid = 0;
for (n=0; n < map->nodes->size && vpid < jdata->num_procs; n++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) {
continue;
}
vpid = jdata->num_procs;
cnt = 0;
for (item = opal_list_get_first(nodes);
item != opal_list_get_end(nodes);
item = opal_list_get_next(item)) {
node = (orte_node_t*)item;
/* get the number of objects - only consider those we can actually use */
num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target,
cache_level, OPAL_HWLOC_AVAILABLE);
@ -318,14 +351,14 @@ static int rank_by(orte_job_t *jdata,
* algorithm, but this works for now.
*/
all_done = false;
while (!all_done && vpid < jdata->num_procs) {
while (!all_done && cnt < app->num_procs) {
all_done = true;
/* cycle across the objects */
for (i=0; i < num_objs && vpid < jdata->num_procs; i++) {
for (i=0; i < num_objs && cnt < app->num_procs; i++) {
obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i);
/* find the next proc on this object */
for (j=0; j < node->procs->size && vpid < jdata->num_procs; j++) {
for (j=0; j < node->procs->size && cnt < app->num_procs; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
}
@ -340,6 +373,10 @@ static int rank_by(orte_job_t *jdata,
if (ORTE_VPID_INVALID != proc->name.vpid) {
continue;
}
/* ignore procs from other apps */
if (proc->app_idx != app->idx) {
continue;
}
/* ignore procs on other objects */
if (!hwloc_bitmap_intersects(obj->cpuset, proc->locale->cpuset)) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
@ -348,6 +385,7 @@ static int rank_by(orte_job_t *jdata,
continue;
}
proc->name.vpid = vpid++;
cnt++;
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rank_by: assigned rank %s", ORTE_VPID_PRINT(proc->name.vpid));
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
@ -359,6 +397,10 @@ static int rank_by(orte_job_t *jdata,
}
/* flag that one was mapped */
all_done = false;
/* track where the highest vpid landed - this is our
* new bookmark
*/
jdata->bookmark = node;
/* move to next object */
break;
}
@ -373,32 +415,40 @@ static int rank_by(orte_job_t *jdata,
}
#endif
int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
int orte_rmaps_base_compute_vpids(orte_job_t *jdata,
orte_app_context_t *app,
opal_list_t *nodes)
{
orte_job_map_t *map;
orte_vpid_t vpid, cnt;
int i, j;
orte_vpid_t vpid;
int j, cnt;
orte_node_t *node;
orte_proc_t *proc, *ptr;
orte_proc_t *proc;
int rc;
opal_list_item_t *item;
map = jdata->map;
if (ORTE_RANK_BY_NODE == ORTE_GET_RANKING_POLICY(map->ranking) ||
ORTE_RANK_BY_BOARD == ORTE_GET_RANKING_POLICY(map->ranking)) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:base: computing vpids by node for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
/* bozo check */
if (0 == opal_list_get_size(nodes)) {
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
return ORTE_ERR_BAD_PARAM;
}
/* assign the ranks round-robin across nodes - only one board/node
* at this time, so they are equivalent
*/
cnt=0;
vpid=0;
while (cnt < jdata->num_procs) {
for (i=0; i < map->nodes->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
continue;
}
vpid=jdata->num_procs;
while (cnt < app->num_procs) {
for (item = opal_list_get_first(nodes);
item != opal_list_get_end(nodes);
item = opal_list_get_next(item)) {
node = (orte_node_t*)item;
for (j=0; j < node->procs->size; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
@ -407,29 +457,12 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
if (proc->name.jobid != jdata->jobid) {
continue;
}
if (ORTE_VPID_INVALID != proc->name.vpid) {
/* vpid was already assigned. Some mappers require that
* we insert the proc into the jdata->procs
* array, while others will have already done it - so check and
* do the operation if required
*/
if (NULL == opal_pointer_array_get_item(jdata->procs, proc->name.vpid)) {
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* if we added it to the array, then account for
* it in our loop - otherwise don't as we would be
* double counting
*/
cnt++;
}
/* ignore procs from other apps */
if (proc->app_idx != app->idx) {
continue;
}
/* find next available vpid */
while (NULL != (ptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, vpid)) &&
ORTE_VPID_INVALID != ptr->name.vpid) {
vpid++;
if (ORTE_VPID_INVALID != proc->name.vpid) {
continue;
}
proc->name.vpid = vpid++;
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
@ -443,6 +476,10 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
return rc;
}
cnt++;
/* track where the highest vpid landed - this is our
* new bookmark
*/
jdata->bookmark = node;
break; /* move on to next node */
}
}
@ -455,11 +492,12 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:base: computing vpids by slot for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
vpid = 0;
for (i=0; i < map->nodes->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
continue;
}
vpid = jdata->num_procs;
for (item = opal_list_get_first(nodes);
item != opal_list_get_end(nodes);
item = opal_list_get_next(item)) {
node = (orte_node_t*)item;
for (j=0; j < node->procs->size; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
@ -468,12 +506,11 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
if (proc->name.jobid != jdata->jobid) {
continue;
}
/* ignore procs from other apps */
if (proc->app_idx != app->idx) {
continue;
}
if (ORTE_VPID_INVALID == proc->name.vpid) {
/* find the next available vpid */
while (NULL != (ptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, vpid)) &&
ORTE_VPID_INVALID != ptr->name.vpid) {
vpid++;
}
proc->name.vpid = vpid++;
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
@ -482,6 +519,10 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
if (0 == ORTE_EPOCH_CMP(ORTE_EPOCH_INVALID,proc->name.epoch)) {
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
}
/* track where the highest vpid landed - this is our
* new bookmark
*/
jdata->bookmark = node;
}
/* some mappers require that we insert the proc into the jdata->procs
* array, while others will have already done it - so check and
@ -503,7 +544,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: computing ranks by NUMA for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_NODE, 0))) {
if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_NODE, 0))) {
ORTE_ERROR_LOG(rc);
}
return rc;
@ -513,7 +554,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: computing ranks by socket for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_SOCKET, 0))) {
if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_SOCKET, 0))) {
ORTE_ERROR_LOG(rc);
}
return rc;
@ -523,7 +564,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: computing ranks by L3cache for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_CACHE, 3))) {
if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_CACHE, 3))) {
ORTE_ERROR_LOG(rc);
}
return rc;
@ -533,7 +574,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: computing ranks by L2cache for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_CACHE, 2))) {
if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_CACHE, 2))) {
ORTE_ERROR_LOG(rc);
}
return rc;
@ -543,7 +584,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: computing ranks by L1cache for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_CACHE, 1))) {
if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_CACHE, 1))) {
ORTE_ERROR_LOG(rc);
}
return rc;
@ -553,7 +594,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: computing ranks by core for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_CORE, 0))) {
if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_CORE, 0))) {
ORTE_ERROR_LOG(rc);
}
return rc;
@ -563,7 +604,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: computing ranks by hwthread for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_PU, 0))) {
if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_PU, 0))) {
ORTE_ERROR_LOG(rc);
}
return rc;
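
To make the per-app ranking flow above concrete: orte_rmaps_base_compute_vpids() now takes the app_context and its node list, starts the vpid counter at jdata->num_procs (the ranks already handed out to earlier app_contexts), skips procs whose app_idx does not match, and records the node that received the highest vpid in jdata->bookmark. The fragment below is a minimal, self-contained sketch of that loop under simplified assumptions; the struct and function names are invented stand-ins for orte_job_t, orte_node_t, orte_proc_t and the real opal list/pointer-array machinery, not the ORTE implementation.

/* Minimal sketch (not the real ORTE code): ranks one app_context at a time.
 * Simplified stand-ins are used for the node/proc objects. */
#include <stdio.h>

#define INVALID_VPID -1

typedef struct { int app_idx; int vpid; } proc_t;
typedef struct { const char *name; proc_t *procs; int nprocs; } node_t;

typedef struct {
    int num_procs;            /* procs already ranked for earlier apps */
    const node_t *bookmark;   /* node that received the highest vpid   */
} job_t;

/* Assign contiguous vpids to the procs of one app, starting where the
 * previous app_context left off (vpid = jdata->num_procs in the commit). */
static int compute_vpids(job_t *job, int app_idx, int app_num_procs,
                         node_t *nodes, int nnodes)
{
    int vpid = job->num_procs;   /* offset past previously ranked apps */
    int cnt = 0;

    for (int n = 0; n < nnodes && cnt < app_num_procs; n++) {
        for (int j = 0; j < nodes[n].nprocs && cnt < app_num_procs; j++) {
            proc_t *p = &nodes[n].procs[j];
            if (p->app_idx != app_idx) continue;      /* other app's proc */
            if (p->vpid != INVALID_VPID) continue;    /* already ranked   */
            p->vpid = vpid++;
            cnt++;
            job->bookmark = &nodes[n];  /* track where the highest vpid landed */
        }
    }
    /* caller updates job->num_procs += app_num_procs AFTER ranking, so the
     * next app_context starts at the right offset */
    return cnt == app_num_procs ? 0 : -1;
}

int main(void)
{
    proc_t p0[] = {{0, INVALID_VPID}, {1, INVALID_VPID}};
    proc_t p1[] = {{1, INVALID_VPID}};
    node_t nodes[] = {{"n0", p0, 2}, {"n1", p1, 1}};
    job_t job = {0, NULL};

    compute_vpids(&job, 0, 1, nodes, 2);  /* app 0: 1 proc  -> vpid 0    */
    job.num_procs += 1;
    compute_vpids(&job, 1, 2, nodes, 2);  /* app 1: 2 procs -> vpids 1,2 */
    job.num_procs += 2;

    printf("bookmark=%s, total=%d\n", job.bookmark->name, job.num_procs);
    return 0;
}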

View file

@ -466,6 +466,11 @@ orte_node_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list,
cur_node_item = opal_list_get_first(node_list);
}
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s Starting bookmark at node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
((orte_node_t*)cur_node_item)->name));
/* is this node fully subscribed? If so, then the first
* proc we assign will oversubscribe it, so let's look
* for another candidate
@ -506,12 +511,20 @@ orte_node_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list,
}
/* if we get here, then we cycled all the way around the
* list without finding a better answer - just use the node
* that is minimally overloaded
* that is minimally overloaded if it is better than
* what we already have
*/
cur_node_item = (opal_list_item_t*)ndmin;
if ((nd1->slots_inuse - nd1->slots_alloc) < (node->slots_inuse - node->slots_alloc)) {
cur_node_item = (opal_list_item_t*)ndmin;
}
}
process:
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s Starting at node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
((orte_node_t*)cur_node_item)->name));
/* make life easier - put the bookmark at the top of the list,
* shifting everything above it to the end of the list while
* preserving order
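
The bookmark handling in orte_rmaps_base_get_starting_point() now only falls back to the minimally overloaded node when that node is actually less overloaded than the bookmark itself (overload being slots_inuse minus slots_alloc). Below is a small, self-contained sketch of that selection, under the assumption of a plain array of simplified node records rather than the real opal_list_t of orte_node_t objects:

#include <stdio.h>

typedef struct { const char *name; int slots_alloc; int slots_inuse; } node_t;

static int overload(const node_t *n) { return n->slots_inuse - n->slots_alloc; }

/* Pick the node to begin mapping at: keep the bookmark while it has free
 * slots, otherwise cycle the list for one that does; if every node is full,
 * fall back to the least-overloaded node only when it beats the bookmark. */
static const node_t *starting_point(const node_t *nodes, int nnodes, int bookmark)
{
    const node_t *start = &nodes[bookmark];
    const node_t *ndmin = start;

    if (start->slots_inuse < start->slots_alloc) {
        return start;                          /* bookmark still has room */
    }
    for (int k = 1; k < nnodes; k++) {
        const node_t *cand = &nodes[(bookmark + k) % nnodes];
        if (cand->slots_inuse < cand->slots_alloc) {
            return cand;                       /* found free slots */
        }
        if (overload(cand) < overload(ndmin)) {
            ndmin = cand;                      /* remember least overloaded */
        }
    }
    return (overload(ndmin) < overload(start)) ? ndmin : start;
}

int main(void)
{
    node_t nodes[] = {{"n0", 4, 6}, {"n1", 4, 4}, {"n2", 4, 5}};
    /* n0 is the bookmark but is overloaded by 2; n1 is full yet not over */
    printf("start at %s\n", starting_point(nodes, 3, 0)->name);
    return 0;
}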

View file

@ -62,7 +62,9 @@ ORTE_DECLSPEC orte_proc_t* orte_rmaps_base_setup_proc(orte_job_t *jdata,
ORTE_DECLSPEC orte_node_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list,
orte_job_t *jdata);
ORTE_DECLSPEC int orte_rmaps_base_compute_vpids(orte_job_t *jdata);
ORTE_DECLSPEC int orte_rmaps_base_compute_vpids(orte_job_t *jdata,
orte_app_context_t *app,
opal_list_t *nodes);
ORTE_DECLSPEC int orte_rmaps_base_compute_local_ranks(orte_job_t *jdata);

View file

@ -242,7 +242,10 @@ static int ppr_mapper(orte_job_t *jdata)
/* cycle across the nodes */
nprocs_mapped = 0;
while (NULL != (node = (orte_node_t*)opal_list_remove_first(&node_list))) {
for (item = opal_list_get_first(&node_list);
item != opal_list_get_end(&node_list);
item = opal_list_get_next(item)) {
node = (orte_node_t*)item;
#if OPAL_HAVE_HWLOC
/* bozo check */
if (NULL == node->topology) {
@ -339,9 +342,6 @@ static int ppr_mapper(orte_job_t *jdata)
node->oversubscribed = true;
}
/* update the number of procs in the job */
jdata->num_procs += node->num_procs;
/* if we haven't mapped all the procs, continue on to the
* next node
*/
@ -360,12 +360,23 @@ static int ppr_mapper(orte_job_t *jdata)
goto error;
}
/* compute vpids and add proc objects to the job */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) {
ORTE_ERROR_LOG(rc);
goto error;
}
}
/* track the total number of processes we mapped - must update
* this AFTER we compute vpids so that computation is done
* correctly
*/
jdata->num_procs += app->num_procs;
while (NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&node_list);
}
return ORTE_SUCCESS;
error:
while (NULL != (item = opal_list_remove_first(&node_list))) {

View file

@ -857,7 +857,7 @@ static int map_to_ftgrps(orte_job_t *jdata)
* done after each app_context is mapped in order to keep the
* vpids contiguous within an app_context
*/
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) {
ORTE_ERROR_LOG(rc);
return rc;
}

View file

@ -177,9 +177,21 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
goto error;
}
/* track the total number of processes we mapped */
/* compute vpids and add proc objects to the job - do this after
* each app_context so that the ranks within each context are
* contiguous
*/
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* track the total number of processes we mapped - must update
* this value AFTER we compute vpids so that computation
* is done correctly
*/
jdata->num_procs += app->num_procs;
/* cleanup the node list - it can differ from one app_context
* to another, so we have to get it every time
*/
@ -187,15 +199,6 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&node_list);
/* compute vpids and add proc objects to the job - do this after
* each app_context so that the ranks within each context are
* contiguous
*/
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
return ORTE_SUCCESS;

View file

@ -57,33 +57,81 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
ORTE_JOBID_PRINT(jdata->jobid), (int)num_slots, (unsigned long)num_procs);
/* check to see if we can map all the procs */
if (num_slots < app->num_procs) {
if (num_slots < (int)(jdata->num_procs + app->num_procs)) {
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
true, app->num_procs, app->app);
return ORTE_ERR_SILENT;
}
oversubscribed = true;
/* compute how many extra procs to put on each node */
balance = (float)(app->num_procs - num_slots) / (float)opal_list_get_size(node_list);
extra_procs_to_assign = (int)balance;
if (0 < (balance - (float)extra_procs_to_assign)) {
/* compute how many nodes need an extra proc */
nxtra_nodes = app->num_procs - num_slots - (extra_procs_to_assign * opal_list_get_size(node_list));
/* add one so that we add an extra proc to the first nodes
* until all procs are mapped
*/
extra_procs_to_assign++;
/* flag that we added one */
add_one = true;
}
/* first pass: map the number of procs to each node until we
* map all specified procs or use all allocated slots
*/
nprocs_mapped = 0;
for (item = opal_list_get_first(node_list);
item != opal_list_get_end(node_list);
item = opal_list_get_next(item)) {
node = (orte_node_t*)item;
#if OPAL_HAVE_HWLOC
/* get the root object as we are not assigning
* locale except at the node level
*/
if (NULL != node->topology) {
obj = hwloc_get_root_obj(node->topology);
}
#endif
if (node->slots_alloc == node->slots_inuse) {
continue;
}
num_procs_to_assign = node->slots_alloc - node->slots_inuse;
for (i=0; i < num_procs_to_assign && nprocs_mapped < app->num_procs; i++) {
/* add this node to the map - do it only once */
if (!node->mapped) {
if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
ORTE_ERROR_LOG(rc);
return rc;
}
node->mapped = true;
OBJ_RETAIN(node); /* maintain accounting on object */
++(jdata->map->num_nodes);
}
if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) {
return ORTE_ERR_OUT_OF_RESOURCE;
}
nprocs_mapped++;
#if OPAL_HAVE_HWLOC
proc->locale = obj;
#endif
}
}
/* map the number of procs to each node until we
* map all specified procs
if (nprocs_mapped == app->num_procs) {
/* we are done */
return ORTE_SUCCESS;
}
/* second pass: if we haven't mapped everyone yet, it is
* because we are oversubscribed. Figure out how many procs
* to add
*/
nprocs_mapped = 0;
while (NULL != (item = opal_list_remove_first(node_list))) {
balance = (float)(app->num_procs - nprocs_mapped) / (float)opal_list_get_size(node_list);
extra_procs_to_assign = (int)balance;
if (0 < (balance - (float)extra_procs_to_assign)) {
/* compute how many nodes need an extra proc */
nxtra_nodes = app->num_procs - nprocs_mapped - (extra_procs_to_assign * opal_list_get_size(node_list));
/* add one so that we add an extra proc to the first nodes
* until all procs are mapped
*/
extra_procs_to_assign++;
/* flag that we added one */
add_one = true;
}
for (item = opal_list_get_first(node_list);
item != opal_list_get_end(node_list);
item = opal_list_get_next(item)) {
node = (orte_node_t*)item;
#if OPAL_HAVE_HWLOC
/* get the root object as we are not assigning
@ -101,30 +149,8 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
--nxtra_nodes;
}
}
if (oversubscribed) {
/* flag the node as oversubscribed so that sched-yield gets
* properly set
*/
node->oversubscribed = true;
}
if (0 == (node->slots_alloc - node->slots_inuse)) {
num_procs_to_assign = 1 + extra_procs_to_assign;
} else {
num_procs_to_assign = (node->slots_alloc - node->slots_inuse) + extra_procs_to_assign;
}
num_procs_to_assign = (node->slots_alloc - node->slots_inuse) + extra_procs_to_assign;
for (i=0; i < num_procs_to_assign && nprocs_mapped < app->num_procs; i++) {
if (0 == i) {
/* add this node to the map - do it only once */
if (!node->mapped) {
if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
ORTE_ERROR_LOG(rc);
return rc;
}
node->mapped = true;
OBJ_RETAIN(node); /* maintain accounting on object */
++(jdata->map->num_nodes);
}
}
if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) {
return ORTE_ERR_OUT_OF_RESOURCE;
}
@ -132,13 +158,17 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
#if OPAL_HAVE_HWLOC
proc->locale = obj;
#endif
/* keep track of the node we last used */
jdata->bookmark = node;
}
/* release the node - the object will persist */
OBJ_RELEASE(node);
}
/* not all nodes are equal, so only set oversubscribed for
* this node if it is in that state
*/
if (node->slots_alloc < (int)node->num_procs) {
/* flag the node as oversubscribed so that sched-yield gets
* properly set
*/
node->oversubscribed = true;
}
}
return ORTE_SUCCESS;
}
@ -167,7 +197,7 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
(int)num_slots, (unsigned long)num_procs);
/* quick check to see if we can map all the procs */
if (num_slots < app->num_procs) {
if (num_slots < (int)(jdata->num_procs + app->num_procs)) {
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
true, app->num_procs, app->app);
@ -211,7 +241,9 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
nprocs_mapped = 0;
lag = 0;
while (NULL != (item = opal_list_remove_first(node_list))) {
for (item = opal_list_get_first(node_list);
item != opal_list_get_end(node_list);
item = opal_list_get_next(item)) {
node = (orte_node_t*)item;
#if OPAL_HAVE_HWLOC
/* get the root object as we are not assigning
@ -243,10 +275,6 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
if (oversubscribed) {
/* everybody just takes their share */
num_procs_to_assign = navg + extra_procs_to_assign;
/* flag the node as oversubscribed so that sched-yield gets
* properly set
*/
node->oversubscribed = true;
} else {
/* if we are not oversubscribed, then there are enough
* slots to handle all the procs. However, not every
@ -254,14 +282,11 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
* have to track how many procs to "shift" elsewhere
* to make up the difference
*/
if (0 == (node->slots_alloc - node->slots_inuse)) {
if (node->slots_alloc == node->slots_inuse) {
/* if there are no extras to take, then we can
* safely remove this node as we don't need it
* ignore this node
*/
if (0 == extra_procs_to_assign) {
opal_pointer_array_set_item(jdata->map->nodes, idx, NULL);
OBJ_RELEASE(node);
--(jdata->map->num_nodes);
/* update how many we are lagging behind */
lag += navg;
continue;
@ -298,11 +323,16 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
#if OPAL_HAVE_HWLOC
proc->locale = obj;
#endif
/* keep track of the node we last used */
jdata->bookmark = node;
}
/* maintain acctg */
OBJ_RELEASE(node);
/* not all nodes are equal, so only set oversubscribed for
* this node if it is in that state
*/
if (node->slots_alloc < (int)node->num_procs) {
/* flag the node as oversubscribed so that sched-yield gets
* properly set
*/
node->oversubscribed = true;
}
if (nprocs_mapped == app->num_procs) {
/* we are done */
break;
@ -372,7 +402,7 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
* do more because we don't know how many total objects exist
* across all the nodes
*/
if (num_slots < app->num_procs) {
if (num_slots < (int)(jdata->num_procs + app->num_procs)) {
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
true, app->num_procs, app->app);
@ -380,11 +410,11 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
}
oversubscribed = true;
/* compute how many extra procs to put on each node */
balance = (float)(app->num_procs - num_slots) / (float)opal_list_get_size(node_list);
balance = (float)((jdata->num_procs + app->num_procs) - num_slots) / (float)opal_list_get_size(node_list);
extra_procs_to_assign = (int)balance;
if (0 < (balance - (float)extra_procs_to_assign)) {
/* compute how many nodes need an extra proc */
nxtra_nodes = app->num_procs - num_slots - (extra_procs_to_assign * opal_list_get_size(node_list));
nxtra_nodes = (jdata->num_procs + app->num_procs) - num_slots - (extra_procs_to_assign * opal_list_get_size(node_list));
/* add one so that we add an extra proc to the first nodes
* until all procs are mapped
*/
@ -400,7 +430,9 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
extra_procs_to_assign, nxtra_nodes);
nprocs_mapped = 0;
while (NULL != (item = opal_list_remove_first(node_list))) {
for (item = opal_list_get_first(node_list);
item != opal_list_get_end(node_list);
item = opal_list_get_next(item)) {
node = (orte_node_t*)item;
/* bozo check */
if (NULL == node->topology) {
@ -408,21 +440,18 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
true, node->name);
return ORTE_ERR_SILENT;
}
/* add this node to the map */
if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
ORTE_ERROR_LOG(idx);
return idx;
/* add this node to the map, if reqd */
if (!node->mapped) {
if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
ORTE_ERROR_LOG(idx);
return idx;
}
node->mapped = true;
OBJ_RETAIN(node); /* maintain accounting on object */
++(jdata->map->num_nodes);
}
OBJ_RETAIN(node); /* maintain accounting on object */
++(jdata->map->num_nodes);
if (oversubscribed) {
/* flag the node as oversubscribed so that sched-yield gets
* properly set
*/
node->oversubscribed = true;
}
/* compute the number of procs to go on this node */
/* compute the number of procs to go on this node */
if (add_one) {
if (0 == nxtra_nodes) {
--extra_procs_to_assign;
@ -431,7 +460,7 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
--nxtra_nodes;
}
}
if (0 == (node->slots_alloc - node->slots_inuse)) {
if (node->slots_alloc == node->slots_inuse) {
/* everybody takes at least the extras */
num_procs_to_assign = extra_procs_to_assign;
} else {
@ -473,11 +502,16 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
nprocs_mapped++;
proc->locale = obj;
}
/* keep track of the node we last used */
jdata->bookmark = node;
}
/* maintain acctg */
OBJ_RELEASE(node);
/* not all nodes are equal, so only set oversubscribed for
* this node if it is in that state
*/
if (node->slots_alloc < (int)node->num_procs) {
/* flag the node as oversubscribed so that sched-yield gets
* properly set
*/
node->oversubscribed = true;
}
if (nprocs_mapped == app->num_procs) {
/* we are done */
break;
@ -516,7 +550,7 @@ static int byobj_span(orte_job_t *jdata,
* do more because we don't know how many total objects exist
* across all the nodes
*/
if (num_slots < app->num_procs) {
if (num_slots < (int)(jdata->num_procs + app->num_procs)) {
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
true, app->num_procs, app->app);
@ -542,11 +576,11 @@ static int byobj_span(orte_job_t *jdata,
/* compute how many extra procs to put on each node */
balance = (float)(app->num_procs - (navg * opal_list_get_size(node_list))) / (float)opal_list_get_size(node_list);
balance = (float)((jdata->num_procs + app->num_procs) - (navg * opal_list_get_size(node_list))) / (float)opal_list_get_size(node_list);
extra_procs_to_assign = (int)balance;
if (0 < (balance - (float)extra_procs_to_assign)) {
/* compute how many nodes need an extra proc */
nxtra_nodes = app->num_procs - ((navg + extra_procs_to_assign) * opal_list_get_size(node_list));
nxtra_nodes = (jdata->num_procs + app->num_procs) - ((navg + extra_procs_to_assign) * opal_list_get_size(node_list));
/* add one so that we add an extra proc to the first nodes
* until all procs are mapped
*/
@ -562,7 +596,9 @@ static int byobj_span(orte_job_t *jdata,
nprocs_mapped = 0;
lag = 0;
while (NULL != (item = opal_list_remove_first(node_list))) {
for (item = opal_list_get_first(node_list);
item != opal_list_get_end(node_list);
item = opal_list_get_next(item)) {
node = (orte_node_t*)item;
/* bozo check */
if (NULL == node->topology) {
@ -570,13 +606,16 @@ static int byobj_span(orte_job_t *jdata,
true, node->name);
return ORTE_ERR_SILENT;
}
/* add this node to the map */
if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
ORTE_ERROR_LOG(idx);
return idx;
/* add this node to the map, if reqd */
if (!node->mapped) {
if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
ORTE_ERROR_LOG(idx);
return idx;
}
node->mapped = true;
OBJ_RETAIN(node); /* maintain accounting on object */
++(jdata->map->num_nodes);
}
OBJ_RETAIN(node); /* maintain accounting on object */
++(jdata->map->num_nodes);
/* compute the number of procs to go on this node */
if (add_one) {
if (0 == nxtra_nodes) {
@ -671,8 +710,15 @@ static int byobj_span(orte_job_t *jdata,
/* keep track of the node we last used */
jdata->bookmark = node;
}
/* maintain acctg */
OBJ_RELEASE(node);
/* not all nodes are equal, so only set oversubscribed for
* this node if it is in that state
*/
if (node->slots_alloc < (int)node->num_procs) {
/* flag the node as oversubscribed so that sched-yield gets
* properly set
*/
node->oversubscribed = true;
}
if (nprocs_mapped == app->num_procs) {
/* we are done */
break;
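
The round-robin byslot/bynode/byobj changes above split mapping into two passes: the first pass only consumes free allocated slots (slots_alloc minus slots_inuse), and only if procs remain does the mapper compute how many extra procs each node must absorb, flagging oversubscription per node (slots_alloc < num_procs) rather than globally. The real code spreads the remainder with a float balance plus the add_one/nxtra_nodes counters; the sketch below shows the equivalent floor-plus-remainder arithmetic with made-up numbers, purely as an illustration:

#include <stdio.h>

/* Sketch of the oversubscription balancing used by the round-robin mappers
 * (simplified; the real code walks an opal_list_t of orte_node_t objects).
 * Remaining procs are spread as floor(remaining/nnodes) per node, with the
 * first (remaining % nnodes) nodes taking one extra. */
int main(void)
{
    int remaining = 7;   /* procs left after filling all allocated slots */
    int nnodes = 3;

    int extra = remaining / nnodes;        /* every node takes this many */
    int nxtra_nodes = remaining % nnodes;  /* these many take one more   */

    for (int n = 0; n < nnodes; n++) {
        int assign = extra + (n < nxtra_nodes ? 1 : 0);
        printf("node %d: +%d extra procs%s\n", n, assign,
               assign > 0 ? " (oversubscribed)" : "");
    }
    /* node 0: +3, node 1: +2, node 2: +2  -> 7 extras total */
    return 0;
}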

View file

@ -104,7 +104,6 @@
*/
static orte_job_t *jdata=NULL;
static char **global_mca_env = NULL;
static bool have_zero_np = false;
static orte_std_cntr_t total_num_apps = 0;
static bool want_prefix_by_default = (bool) ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT;
static char *ompi_server=NULL;
@ -1646,31 +1645,6 @@ static int create_app(int argc, char* argv[], orte_app_context_t **app_ptr,
app->num_procs = (orte_std_cntr_t)orterun_globals.num_procs;
/* If the user didn't specify the number of processes to run, then we
default to launching an app process using every slot. We can't do
anything about that here - we leave it to the RMAPS framework's
components to note this and deal with it later.
HOWEVER, we ONLY support this mode of operation if the number of
app_contexts is equal to ONE. If the user provides multiple applications,
we simply must have more information - in this case, generate an
error.
*/
if (app->num_procs == 0) {
have_zero_np = true; /** flag that we have a zero_np situation */
}
if (0 < total_num_apps && have_zero_np) {
/** we have more than one app and a zero_np - that's no good.
* note that we have to do this as a two step logic check since
* the user may fail to specify num_procs for the first app, but
* then give us another application.
*/
orte_show_help("help-orterun.txt", "orterun:multi-apps-and-zero-np",
true, orte_basename, NULL);
return ORTE_ERR_FATAL;
}
total_num_apps++;
/* Preserve if we are to preload the binary */