The rest of the multi-app_context fix: remove the restriction on the number of app_contexts that can have zero np specified, since multiple mappers now support that use case; update the ranking algorithms to respect and track bookmarks; and ensure the oversubscribed flag is properly set on a per-node basis.
This commit was SVN r25578.
This commit is contained in:
parent d9c7764e9b
commit 90b7f2a7bf
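For orientation, the pattern this commit threads through every mapper reduces to three rules: ranks for each app_context start where the previous context ended (jdata->num_procs), the job bookmark follows the node that received the highest vpid, and oversubscription is judged node by node rather than job-wide. Below is a minimal standalone C sketch of that pattern; the node_t/job_t types and assign_ranks_for_app() are invented for illustration and are not the ORTE APIs.

/* Standalone sketch of the per-app_context ranking pattern: contiguous
 * vpids per context, bookmark tracking, per-node oversubscription.
 * All names here are hypothetical stand-ins for the real ORTE structures.
 */
#include <stdio.h>

typedef struct {
    const char *name;
    int slots_alloc;    /* slots allocated to us on this node */
    int num_procs;      /* procs mapped onto this node so far */
    int oversubscribed;
} node_t;

typedef struct {
    int num_procs;      /* ranks assigned so far, across all app contexts */
    node_t *bookmark;   /* node that received the highest rank */
} job_t;

/* assign nprocs contiguous ranks round-robin across the nodes */
static void assign_ranks_for_app(job_t *job, node_t *nodes, int nnodes, int nprocs)
{
    int vpid = job->num_procs;   /* continue after the previous context */
    int cnt = 0;
    while (cnt < nprocs) {
        for (int n = 0; n < nnodes && cnt < nprocs; n++) {
            printf("rank %d -> %s\n", vpid++, nodes[n].name);
            nodes[n].num_procs++;
            cnt++;
            job->bookmark = &nodes[n];  /* track where the highest vpid landed */
        }
    }
    /* update the job total AFTER computing vpids, as the commit message notes */
    job->num_procs += nprocs;
    /* not all nodes are equal: flag oversubscription node by node */
    for (int n = 0; n < nnodes; n++) {
        if (nodes[n].slots_alloc < nodes[n].num_procs) {
            nodes[n].oversubscribed = 1;
        }
    }
}

int main(void)
{
    node_t nodes[2] = { { "n0", 2, 0, 0 }, { "n1", 1, 0, 0 } };
    job_t job = { 0, NULL };
    assign_ranks_for_app(&job, nodes, 2, 3);  /* first app_context: ranks 0-2 */
    assign_ranks_for_app(&job, nodes, 2, 2);  /* second context: ranks 3-4 */
    printf("bookmark=%s n1 oversubscribed=%d\n",
           job.bookmark->name, nodes[1].oversubscribed);
    return 0;
}

Running the sketch gives ranks 0-2 to the first context and 3-4 to the second, leaves the bookmark on the node that received rank 4, and flags both nodes as oversubscribed.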
@@ -50,16 +50,20 @@
 #if OPAL_HAVE_HWLOC
 static int rank_span(orte_job_t *jdata,
+                     orte_app_context_t *app,
+                     opal_list_t *nodes,
                      hwloc_obj_type_t target,
                      unsigned cache_level)
 {
     orte_job_map_t *map;
     hwloc_obj_t obj;
-    int num_objs, i, j, n, rc;
+    int num_objs, i, j, rc;
     orte_vpid_t num_ranked=0;
     orte_node_t *node;
     orte_proc_t *proc;
     orte_vpid_t vpid;
+    int cnt;
+    opal_list_item_t *item;

     opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                         "mca:rmaps:rank_span: for job %s",
@@ -83,12 +87,13 @@ static int rank_span(orte_job_t *jdata,
      */

     map = jdata->map;
-    vpid = 0;
-    while (vpid < jdata->num_procs) {
-        for (n=0; n < map->nodes->size && vpid < jdata->num_procs; n++) {
-            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) {
-                continue;
-            }
+    vpid = jdata->num_procs;
+    cnt = 0;
+    while (cnt < app->num_procs) {
+        for (item = opal_list_get_first(nodes);
+             item != opal_list_get_end(nodes);
+             item = opal_list_get_next(item)) {
+            node = (orte_node_t*)item;
             /* get the number of objects - only consider those we can actually use */
             num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target,
                                                           cache_level, OPAL_HWLOC_AVAILABLE);
@@ -97,7 +102,7 @@ static int rank_span(orte_job_t *jdata,
                                 num_objs, node->name, (int)node->num_procs);

             /* for each object */
-            for (i=0; i < num_objs && vpid < jdata->num_procs; i++) {
+            for (i=0; i < num_objs && cnt < app->num_procs; i++) {
                 obj = opal_hwloc_base_get_obj_by_type(node->topology, target,
                                                       cache_level, i, OPAL_HWLOC_AVAILABLE);

@@ -105,7 +110,7 @@ static int rank_span(orte_job_t *jdata,
                                     "mca:rmaps:rank_span: working object %d", i);

                 /* cycle thru the procs on this node */
-                for (j=0; j < node->procs->size && vpid < jdata->num_procs; j++) {
+                for (j=0; j < node->procs->size && cnt < app->num_procs; j++) {
                     if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                         continue;
                     }
@@ -120,6 +125,10 @@ static int rank_span(orte_job_t *jdata,
                     if (ORTE_VPID_INVALID != proc->name.vpid) {
                         continue;
                     }
+                    /* ignore procs from other apps */
+                    if (proc->app_idx != app->idx) {
+                        continue;
+                    }
                     /* protect against bozo case */
                     if (NULL == proc->locale) {
                         ORTE_ERROR_LOG(ORTE_ERROR);
@@ -135,6 +144,7 @@ static int rank_span(orte_job_t *jdata,
                     opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                         "mca:rmaps:rank_span: assigning vpid %s", ORTE_VPID_PRINT(vpid));
                     proc->name.vpid = vpid++;
+                    cnt++;
                     ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
                     ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));

@@ -146,6 +156,10 @@ static int rank_span(orte_job_t *jdata,
                         ORTE_ERROR_LOG(rc);
                         return rc;
                     }
+                    /* track where the highest vpid landed - this is our
+                     * new bookmark
+                     */
+                    jdata->bookmark = node;
                     /* move to next object */
                     break;
                 }
@@ -157,16 +171,20 @@ static int rank_span(orte_job_t *jdata,
 }

 static int rank_fill(orte_job_t *jdata,
+                     orte_app_context_t *app,
+                     opal_list_t *nodes,
                      hwloc_obj_type_t target,
                      unsigned cache_level)
 {
     orte_job_map_t *map;
     hwloc_obj_t obj;
-    int num_objs, i, j, n, rc;
+    int num_objs, i, j, rc;
     orte_vpid_t num_ranked=0;
     orte_node_t *node;
     orte_proc_t *proc;
     orte_vpid_t vpid;
+    int cnt;
+    opal_list_item_t *item;

     opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                         "mca:rmaps:rank_fill: for job %s",
@@ -182,11 +200,12 @@ static int rank_fill(orte_job_t *jdata,
      */

     map = jdata->map;
-    vpid = 0;
-    for (n=0; n < map->nodes->size && vpid < jdata->num_procs; n++) {
-        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) {
-            continue;
-        }
+    vpid = jdata->num_procs;
+    cnt = 0;
+    for (item = opal_list_get_first(nodes);
+         item != opal_list_get_end(nodes);
+         item = opal_list_get_next(item)) {
+        node = (orte_node_t*)item;
         /* get the number of objects - only consider those we can actually use */
         num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target,
                                                       cache_level, OPAL_HWLOC_AVAILABLE);
@@ -195,7 +214,7 @@ static int rank_fill(orte_job_t *jdata,
                             num_objs, node->name, (int)node->num_procs);

         /* for each object */
-        for (i=0; i < num_objs && vpid < jdata->num_procs; i++) {
+        for (i=0; i < num_objs && cnt < app->num_procs; i++) {
             obj = opal_hwloc_base_get_obj_by_type(node->topology, target,
                                                   cache_level, i, OPAL_HWLOC_AVAILABLE);

@@ -203,7 +222,7 @@ static int rank_fill(orte_job_t *jdata,
                                 "mca:rmaps:rank_fill: working object %d", i);

             /* cycle thru the procs on this node */
-            for (j=0; j < node->procs->size && vpid < jdata->num_procs; j++) {
+            for (j=0; j < node->procs->size && cnt < app->num_procs; j++) {
                 if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                     continue;
                 }
@@ -217,6 +236,10 @@ static int rank_fill(orte_job_t *jdata,
                 /* ignore procs that are already assigned */
                 if (ORTE_VPID_INVALID != proc->name.vpid) {
                     continue;
                 }
+                /* ignore procs from other apps */
+                if (proc->app_idx != app->idx) {
+                    continue;
+                }
                 /* protect against bozo case */
                 if (NULL == proc->locale) {
@@ -233,6 +256,7 @@ static int rank_fill(orte_job_t *jdata,
                 opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                     "mca:rmaps:rank_fill: assigning vpid %s", ORTE_VPID_PRINT(vpid));
                 proc->name.vpid = vpid++;
+                cnt++;
                 ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
                 ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));

@@ -244,6 +268,10 @@ static int rank_fill(orte_job_t *jdata,
                     ORTE_ERROR_LOG(rc);
                     return rc;
                 }
+                /* track where the highest vpid landed - this is our
+                 * new bookmark
+                 */
+                jdata->bookmark = node;
             }
         }
     }
@@ -252,23 +280,27 @@ static int rank_fill(orte_job_t *jdata,
 }

 static int rank_by(orte_job_t *jdata,
+                   orte_app_context_t *app,
+                   opal_list_t *nodes,
                    hwloc_obj_type_t target,
                    unsigned cache_level)
 {
     orte_job_map_t *map;
     hwloc_obj_t obj;
-    int num_objs, i, j, n;
+    int num_objs, i, j;
     orte_vpid_t num_ranked=0;
     orte_node_t *node;
     orte_proc_t *proc;
     orte_vpid_t vpid;
+    int cnt;
     opal_pointer_array_t objs;
     bool all_done;
+    opal_list_item_t *item;

     if (ORTE_RANKING_SPAN & ORTE_GET_RANKING_DIRECTIVE(jdata->map->ranking)) {
-        return rank_span(jdata, target, cache_level);
+        return rank_span(jdata, app, nodes, target, cache_level);
     } else if (ORTE_RANKING_FILL & ORTE_GET_RANKING_DIRECTIVE(jdata->map->ranking)) {
-        return rank_fill(jdata, target, cache_level);
+        return rank_fill(jdata, app, nodes, target, cache_level);
     }

     /* if ranking is not spanned or filled, then we
@@ -288,11 +320,12 @@ static int rank_by(orte_job_t *jdata,
     opal_pointer_array_init(&objs, 2, INT_MAX, 2);

     map = jdata->map;
-    vpid = 0;
-    for (n=0; n < map->nodes->size && vpid < jdata->num_procs; n++) {
-        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) {
-            continue;
-        }
+    vpid = jdata->num_procs;
+    cnt = 0;
+    for (item = opal_list_get_first(nodes);
+         item != opal_list_get_end(nodes);
+         item = opal_list_get_next(item)) {
+        node = (orte_node_t*)item;
         /* get the number of objects - only consider those we can actually use */
         num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target,
                                                       cache_level, OPAL_HWLOC_AVAILABLE);
@@ -318,14 +351,14 @@ static int rank_by(orte_job_t *jdata,
          * algorithm, but this works for now.
          */
         all_done = false;
-        while (!all_done && vpid < jdata->num_procs) {
+        while (!all_done && cnt < app->num_procs) {
             all_done = true;
             /* cycle across the objects */
-            for (i=0; i < num_objs && vpid < jdata->num_procs; i++) {
+            for (i=0; i < num_objs && cnt < app->num_procs; i++) {
                 obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i);

                 /* find the next proc on this object */
-                for (j=0; j < node->procs->size && vpid < jdata->num_procs; j++) {
+                for (j=0; j < node->procs->size && cnt < app->num_procs; j++) {
                     if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                         continue;
                     }
@@ -340,6 +373,10 @@ static int rank_by(orte_job_t *jdata,
                     if (ORTE_VPID_INVALID != proc->name.vpid) {
                         continue;
                     }
+                    /* ignore procs from other apps */
+                    if (proc->app_idx != app->idx) {
+                        continue;
+                    }
                     /* ignore procs on other objects */
                     if (!hwloc_bitmap_intersects(obj->cpuset, proc->locale->cpuset)) {
                         opal_output_verbose(5, orte_rmaps_base.rmaps_output,
@@ -348,6 +385,7 @@ static int rank_by(orte_job_t *jdata,
                         continue;
                     }
                     proc->name.vpid = vpid++;
+                    cnt++;
                     opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                         "mca:rmaps:rank_by: assigned rank %s", ORTE_VPID_PRINT(proc->name.vpid));
                     ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
@@ -359,6 +397,10 @@ static int rank_by(orte_job_t *jdata,
                     }
                     /* flag that one was mapped */
                     all_done = false;
+                    /* track where the highest vpid landed - this is our
+                     * new bookmark
+                     */
+                    jdata->bookmark = node;
                     /* move to next object */
                     break;
                 }
@@ -373,32 +415,40 @@ static int rank_by(orte_job_t *jdata,
     }
 #endif

-int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
+int orte_rmaps_base_compute_vpids(orte_job_t *jdata,
+                                  orte_app_context_t *app,
+                                  opal_list_t *nodes)
 {
     orte_job_map_t *map;
-    orte_vpid_t vpid, cnt;
-    int i, j;
+    orte_vpid_t vpid;
+    int j, cnt;
     orte_node_t *node;
-    orte_proc_t *proc, *ptr;
+    orte_proc_t *proc;
     int rc;
+    opal_list_item_t *item;

     map = jdata->map;

     if (ORTE_RANK_BY_NODE == ORTE_GET_RANKING_POLICY(map->ranking) ||
         ORTE_RANK_BY_BOARD == ORTE_GET_RANKING_POLICY(map->ranking)) {
         opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                             "mca:rmaps:base: computing vpids by node for job %s",
                             ORTE_JOBID_PRINT(jdata->jobid));
+        /* bozo check */
+        if (0 == opal_list_get_size(nodes)) {
+            ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
+            return ORTE_ERR_BAD_PARAM;
+        }
         /* assign the ranks round-robin across nodes - only one board/node
          * at this time, so they are equivalent
          */
         cnt=0;
-        vpid=0;
-        while (cnt < jdata->num_procs) {
-            for (i=0; i < map->nodes->size; i++) {
-                if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
-                    continue;
-                }
+        vpid=jdata->num_procs;
+        while (cnt < app->num_procs) {
+            for (item = opal_list_get_first(nodes);
+                 item != opal_list_get_end(nodes);
+                 item = opal_list_get_next(item)) {
+                node = (orte_node_t*)item;
                 for (j=0; j < node->procs->size; j++) {
                     if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                         continue;
@@ -407,29 +457,12 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
                     if (proc->name.jobid != jdata->jobid) {
                         continue;
                     }
-                    if (ORTE_VPID_INVALID != proc->name.vpid) {
-                        /* vpid was already assigned. Some mappers require that
-                         * we insert the proc into the jdata->procs
-                         * array, while others will have already done it - so check and
-                         * do the operation if required
-                         */
-                        if (NULL == opal_pointer_array_get_item(jdata->procs, proc->name.vpid)) {
-                            if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
-                                ORTE_ERROR_LOG(rc);
-                                return rc;
-                            }
-                            /* if we added it to the array, then account for
-                             * it in our loop - otherwise don't as we would be
-                             * double counting
-                             */
-                            cnt++;
-                        }
+                    /* ignore procs from other apps */
+                    if (proc->app_idx != app->idx) {
                         continue;
                     }
-                    /* find next available vpid */
-                    while (NULL != (ptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, vpid)) &&
-                           ORTE_VPID_INVALID != ptr->name.vpid) {
-                        vpid++;
+                    if (ORTE_VPID_INVALID != proc->name.vpid) {
+                        continue;
                     }
                     proc->name.vpid = vpid++;
                     ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
@@ -443,6 +476,10 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
                         return rc;
                     }
                     cnt++;
+                    /* track where the highest vpid landed - this is our
+                     * new bookmark
+                     */
+                    jdata->bookmark = node;
                     break;  /* move on to next node */
                 }
             }
@@ -455,11 +492,12 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
         opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                             "mca:rmaps:base: computing vpids by slot for job %s",
                             ORTE_JOBID_PRINT(jdata->jobid));
-        vpid = 0;
-        for (i=0; i < map->nodes->size; i++) {
-            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
-                continue;
-            }
+        vpid = jdata->num_procs;
+        for (item = opal_list_get_first(nodes);
+             item != opal_list_get_end(nodes);
+             item = opal_list_get_next(item)) {
+            node = (orte_node_t*)item;

             for (j=0; j < node->procs->size; j++) {
                 if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                     continue;
@@ -468,12 +506,11 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
                 if (proc->name.jobid != jdata->jobid) {
                     continue;
                 }
+                /* ignore procs from other apps */
+                if (proc->app_idx != app->idx) {
+                    continue;
+                }
                 if (ORTE_VPID_INVALID == proc->name.vpid) {
-                    /* find the next available vpid */
-                    while (NULL != (ptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, vpid)) &&
-                           ORTE_VPID_INVALID != ptr->name.vpid) {
-                        vpid++;
-                    }
                     proc->name.vpid = vpid++;
                     ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
                     ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
@@ -482,6 +519,10 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
                 if (0 == ORTE_EPOCH_CMP(ORTE_EPOCH_INVALID,proc->name.epoch)) {
                     ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
                 }
+                /* track where the highest vpid landed - this is our
+                 * new bookmark
+                 */
+                jdata->bookmark = node;
             }
             /* some mappers require that we insert the proc into the jdata->procs
              * array, while others will have already done it - so check and
@@ -503,7 +544,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
         opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                             "mca:rmaps: computing ranks by NUMA for job %s",
                             ORTE_JOBID_PRINT(jdata->jobid));
-        if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_NODE, 0))) {
+        if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_NODE, 0))) {
             ORTE_ERROR_LOG(rc);
         }
         return rc;
@@ -513,7 +554,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
         opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                             "mca:rmaps: computing ranks by socket for job %s",
                             ORTE_JOBID_PRINT(jdata->jobid));
-        if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_SOCKET, 0))) {
+        if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_SOCKET, 0))) {
             ORTE_ERROR_LOG(rc);
         }
         return rc;
@@ -523,7 +564,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
         opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                             "mca:rmaps: computing ranks by L3cache for job %s",
                             ORTE_JOBID_PRINT(jdata->jobid));
-        if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_CACHE, 3))) {
+        if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_CACHE, 3))) {
             ORTE_ERROR_LOG(rc);
         }
         return rc;
@@ -533,7 +574,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
         opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                             "mca:rmaps: computing ranks by L2cache for job %s",
                             ORTE_JOBID_PRINT(jdata->jobid));
-        if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_CACHE, 2))) {
+        if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_CACHE, 2))) {
             ORTE_ERROR_LOG(rc);
         }
         return rc;
@@ -543,7 +584,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
         opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                             "mca:rmaps: computing ranks by L1cache for job %s",
                             ORTE_JOBID_PRINT(jdata->jobid));
-        if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_CACHE, 1))) {
+        if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_CACHE, 1))) {
             ORTE_ERROR_LOG(rc);
         }
         return rc;
@@ -553,7 +594,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
         opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                             "mca:rmaps: computing ranks by core for job %s",
                             ORTE_JOBID_PRINT(jdata->jobid));
-        if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_CORE, 0))) {
+        if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_CORE, 0))) {
             ORTE_ERROR_LOG(rc);
         }
         return rc;
@@ -563,7 +604,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
         opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                             "mca:rmaps: computing ranks by hwthread for job %s",
                             ORTE_JOBID_PRINT(jdata->jobid));
-        if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_PU, 0))) {
+        if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_PU, 0))) {
             ORTE_ERROR_LOG(rc);
         }
         return rc;
@@ -466,6 +466,11 @@ orte_node_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list,
         cur_node_item = opal_list_get_first(node_list);
     }

+    OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
+                         "%s Starting bookmark at node %s",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         ((orte_node_t*)cur_node_item)->name));
+
     /* is this node fully subscribed? If so, then the first
      * proc we assign will oversubscribe it, so let's look
      * for another candidate
@@ -506,12 +511,20 @@ orte_node_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list,
     }
     /* if we get here, then we cycled all the way around the
      * list without finding a better answer - just use the node
-     * that is minimally overloaded
+     * that is minimally overloaded if it is better than
+     * what we already have
      */
-    cur_node_item = (opal_list_item_t*)ndmin;
+    if ((nd1->slots_inuse - nd1->slots_alloc) < (node->slots_inuse - node->slots_alloc)) {
+        cur_node_item = (opal_list_item_t*)ndmin;
+    }
     }

 process:
+    OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
+                         "%s Starting at node %s",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         ((orte_node_t*)cur_node_item)->name));
+
     /* make life easier - put the bookmark at the top of the list,
      * shifting everything above it to the end of the list while
      * preserving order
@@ -62,7 +62,9 @@ ORTE_DECLSPEC orte_proc_t* orte_rmaps_base_setup_proc(orte_job_t *jdata,
 ORTE_DECLSPEC orte_node_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list,
                                                               orte_job_t *jdata);

-ORTE_DECLSPEC int orte_rmaps_base_compute_vpids(orte_job_t *jdata);
+ORTE_DECLSPEC int orte_rmaps_base_compute_vpids(orte_job_t *jdata,
+                                                orte_app_context_t *app,
+                                                opal_list_t *nodes);

 ORTE_DECLSPEC int orte_rmaps_base_compute_local_ranks(orte_job_t *jdata);

@@ -242,7 +242,10 @@ static int ppr_mapper(orte_job_t *jdata)

     /* cycle across the nodes */
     nprocs_mapped = 0;
-    while (NULL != (node = (orte_node_t*)opal_list_remove_first(&node_list))) {
+    for (item = opal_list_get_first(&node_list);
+         item != opal_list_get_end(&node_list);
+         item = opal_list_get_next(item)) {
+        node = (orte_node_t*)item;
 #if OPAL_HAVE_HWLOC
         /* bozo check */
         if (NULL == node->topology) {
@@ -339,9 +342,6 @@ static int ppr_mapper(orte_job_t *jdata)
             node->oversubscribed = true;
         }

-        /* update the number of procs in the job */
-        jdata->num_procs += node->num_procs;
-
         /* if we haven't mapped all the procs, continue on to the
          * next node
          */
@@ -360,12 +360,23 @@ static int ppr_mapper(orte_job_t *jdata)
             goto error;
         }
         /* compute vpids and add proc objects to the job */
-        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
+        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) {
             ORTE_ERROR_LOG(rc);
             goto error;
         }
     }

+        /* track the total number of processes we mapped - must update
+         * this AFTER we compute vpids so that computation is done
+         * correctly
+         */
+        jdata->num_procs += app->num_procs;
+
         while (NULL != (item = opal_list_remove_first(&node_list))) {
             OBJ_RELEASE(item);
         }
         OBJ_DESTRUCT(&node_list);
     }
     return ORTE_SUCCESS;

 error:
     while (NULL != (item = opal_list_remove_first(&node_list))) {
@@ -857,7 +857,7 @@ static int map_to_ftgrps(orte_job_t *jdata)
      * done after each app_context is mapped in order to keep the
      * vpids contiguous within an app_context
      */
-    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
+    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) {
         ORTE_ERROR_LOG(rc);
         return rc;
     }
@@ -177,9 +177,21 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
             goto error;
         }

-        /* track the total number of processes we mapped */
+        /* compute vpids and add proc objects to the job - do this after
+         * each app_context so that the ranks within each context are
+         * contiguous
+         */
+        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) {
+            ORTE_ERROR_LOG(rc);
+            return rc;
+        }
+
+        /* track the total number of processes we mapped - must update
+         * this value AFTER we compute vpids so that computation
+         * is done correctly
+         */
         jdata->num_procs += app->num_procs;

         /* cleanup the node list - it can differ from one app_context
          * to another, so we have to get it every time
          */
@@ -187,15 +199,6 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
             OBJ_RELEASE(item);
         }
         OBJ_DESTRUCT(&node_list);
-
-        /* compute vpids and add proc objects to the job - do this after
-         * each app_context so that the ranks within each context are
-         * contiguous
-         */
-        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
-            ORTE_ERROR_LOG(rc);
-            return rc;
-        }
     }

     return ORTE_SUCCESS;
@@ -57,33 +57,81 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
                         ORTE_JOBID_PRINT(jdata->jobid), (int)num_slots, (unsigned long)num_procs);

     /* check to see if we can map all the procs */
-    if (num_slots < app->num_procs) {
+    if (num_slots < (int)(jdata->num_procs + app->num_procs)) {
         if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
             orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                            true, app->num_procs, app->app);
             return ORTE_ERR_SILENT;
         }
-        oversubscribed = true;
-        /* compute how many extra procs to put on each node */
-        balance = (float)(app->num_procs - num_slots) / (float)opal_list_get_size(node_list);
-        extra_procs_to_assign = (int)balance;
-        if (0 < (balance - (float)extra_procs_to_assign)) {
-            /* compute how many nodes need an extra proc */
-            nxtra_nodes = app->num_procs - num_slots - (extra_procs_to_assign * opal_list_get_size(node_list));
-            /* add one so that we add an extra proc to the first nodes
-             * until all procs are mapped
-             */
-            extra_procs_to_assign++;
-            /* flag that we added one */
-            add_one = true;
-        }
     }

+    /* first pass: map the number of procs to each node until we
+     * map all specified procs or use all allocated slots
+     */
+    nprocs_mapped = 0;
+    for (item = opal_list_get_first(node_list);
+         item != opal_list_get_end(node_list);
+         item = opal_list_get_next(item)) {
+        node = (orte_node_t*)item;
+#if OPAL_HAVE_HWLOC
+        /* get the root object as we are not assigning
+         * locale except at the node level
+         */
+        if (NULL != node->topology) {
+            obj = hwloc_get_root_obj(node->topology);
+        }
+#endif
+        if (node->slots_alloc == node->slots_inuse) {
+            continue;
+        }
+        num_procs_to_assign = node->slots_alloc - node->slots_inuse;
+        for (i=0; i < num_procs_to_assign && nprocs_mapped < app->num_procs; i++) {
+            /* add this node to the map - do it only once */
+            if (!node->mapped) {
+                if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
+                    ORTE_ERROR_LOG(rc);
+                    return rc;
+                }
+                node->mapped = true;
+                OBJ_RETAIN(node);  /* maintain accounting on object */
+                ++(jdata->map->num_nodes);
+            }
+            if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) {
+                return ORTE_ERR_OUT_OF_RESOURCE;
+            }
+            nprocs_mapped++;
+#if OPAL_HAVE_HWLOC
+            proc->locale = obj;
+#endif
+        }
+    }

-    /* map the number of procs to each node until we
-     * map all specified procs
-     */
-    nprocs_mapped = 0;
-    while (NULL != (item = opal_list_remove_first(node_list))) {
+    if (nprocs_mapped == app->num_procs) {
+        /* we are done */
+        return ORTE_SUCCESS;
+    }
+
+    /* second pass: if we haven't mapped everyone yet, it is
+     * because we are oversubscribed. Figure out how many procs
+     * to add
+     */
+    balance = (float)(app->num_procs - nprocs_mapped) / (float)opal_list_get_size(node_list);
+    extra_procs_to_assign = (int)balance;
+    if (0 < (balance - (float)extra_procs_to_assign)) {
+        /* compute how many nodes need an extra proc */
+        nxtra_nodes = app->num_procs - nprocs_mapped - (extra_procs_to_assign * opal_list_get_size(node_list));
+        /* add one so that we add an extra proc to the first nodes
+         * until all procs are mapped
+         */
+        extra_procs_to_assign++;
+        /* flag that we added one */
+        add_one = true;
+    }
+
+    for (item = opal_list_get_first(node_list);
+         item != opal_list_get_end(node_list);
+         item = opal_list_get_next(item)) {
         node = (orte_node_t*)item;
 #if OPAL_HAVE_HWLOC
         /* get the root object as we are not assigning
@@ -101,30 +149,8 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
                 --nxtra_nodes;
             }
         }
-        if (oversubscribed) {
-            /* flag the node as oversubscribed so that sched-yield gets
-             * properly set
-             */
-            node->oversubscribed = true;
-        }
-        if (0 == (node->slots_alloc - node->slots_inuse)) {
-            num_procs_to_assign = 1 + extra_procs_to_assign;
-        } else {
-            num_procs_to_assign = (node->slots_alloc - node->slots_inuse) + extra_procs_to_assign;
-        }
+        num_procs_to_assign = (node->slots_alloc - node->slots_inuse) + extra_procs_to_assign;
         for (i=0; i < num_procs_to_assign && nprocs_mapped < app->num_procs; i++) {
-            if (0 == i) {
-                /* add this node to the map - do it only once */
-                if (!node->mapped) {
-                    if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
-                        ORTE_ERROR_LOG(rc);
-                        return rc;
-                    }
-                    node->mapped = true;
-                    OBJ_RETAIN(node);  /* maintain accounting on object */
-                    ++(jdata->map->num_nodes);
-                }
-            }
             if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) {
                 return ORTE_ERR_OUT_OF_RESOURCE;
             }
@@ -132,13 +158,17 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
 #if OPAL_HAVE_HWLOC
             proc->locale = obj;
 #endif
-            /* keep track of the node we last used */
-            jdata->bookmark = node;
         }
-        /* release the node - the object will persist */
-        OBJ_RELEASE(node);
+        /* not all nodes are equal, so only set oversubscribed for
+         * this node if it is in that state
+         */
+        if (node->slots_alloc < (int)node->num_procs) {
+            /* flag the node as oversubscribed so that sched-yield gets
+             * properly set
+             */
+            node->oversubscribed = true;
+        }
     }

     return ORTE_SUCCESS;
 }
@@ -167,7 +197,7 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
                         (int)num_slots, (unsigned long)num_procs);

     /* quick check to see if we can map all the procs */
-    if (num_slots < app->num_procs) {
+    if (num_slots < (int)(jdata->num_procs + app->num_procs)) {
         if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
             orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                            true, app->num_procs, app->app);
@@ -211,7 +241,9 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,

     nprocs_mapped = 0;
     lag = 0;
-    while (NULL != (item = opal_list_remove_first(node_list))) {
+    for (item = opal_list_get_first(node_list);
+         item != opal_list_get_end(node_list);
+         item = opal_list_get_next(item)) {
         node = (orte_node_t*)item;
 #if OPAL_HAVE_HWLOC
         /* get the root object as we are not assigning
@@ -243,10 +275,6 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
         if (oversubscribed) {
             /* everybody just takes their share */
             num_procs_to_assign = navg + extra_procs_to_assign;
-            /* flag the node as oversubscribed so that sched-yield gets
-             * properly set
-             */
-            node->oversubscribed = true;
         } else {
             /* if we are not oversubscribed, then there are enough
              * slots to handle all the procs. However, not every
@@ -254,14 +282,11 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
              * have to track how many procs to "shift" elsewhere
              * to make up the difference
              */
-            if (0 == (node->slots_alloc - node->slots_inuse)) {
+            if (node->slots_alloc == node->slots_inuse) {
                 /* if there are no extras to take, then we can
-                 * safely remove this node as we don't need it
+                 * ignore this node
                  */
                 if (0 == extra_procs_to_assign) {
-                    opal_pointer_array_set_item(jdata->map->nodes, idx, NULL);
-                    OBJ_RELEASE(node);
-                    --(jdata->map->num_nodes);
                     /* update how many we are lagging behind */
                     lag += navg;
                     continue;
@@ -298,11 +323,16 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
 #if OPAL_HAVE_HWLOC
             proc->locale = obj;
 #endif
-            /* keep track of the node we last used */
-            jdata->bookmark = node;
         }
-        /* maintain acctg */
-        OBJ_RELEASE(node);
+        /* not all nodes are equal, so only set oversubscribed for
+         * this node if it is in that state
+         */
+        if (node->slots_alloc < (int)node->num_procs) {
+            /* flag the node as oversubscribed so that sched-yield gets
+             * properly set
+             */
+            node->oversubscribed = true;
+        }
         if (nprocs_mapped == app->num_procs) {
             /* we are done */
             break;
@@ -372,7 +402,7 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
      * do more because we don't know how many total objects exist
      * across all the nodes
      */
-    if (num_slots < app->num_procs) {
+    if (num_slots < (int)(jdata->num_procs + app->num_procs)) {
         if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
             orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                            true, app->num_procs, app->app);
@@ -380,11 +410,11 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
         }
         oversubscribed = true;
         /* compute how many extra procs to put on each node */
-        balance = (float)(app->num_procs - num_slots) / (float)opal_list_get_size(node_list);
+        balance = (float)((jdata->num_procs + app->num_procs) - num_slots) / (float)opal_list_get_size(node_list);
         extra_procs_to_assign = (int)balance;
         if (0 < (balance - (float)extra_procs_to_assign)) {
             /* compute how many nodes need an extra proc */
-            nxtra_nodes = app->num_procs - num_slots - (extra_procs_to_assign * opal_list_get_size(node_list));
+            nxtra_nodes = (jdata->num_procs + app->num_procs) - num_slots - (extra_procs_to_assign * opal_list_get_size(node_list));
             /* add one so that we add an extra proc to the first nodes
              * until all procs are mapped
              */
@@ -400,7 +430,9 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
                         extra_procs_to_assign, nxtra_nodes);

     nprocs_mapped = 0;
-    while (NULL != (item = opal_list_remove_first(node_list))) {
+    for (item = opal_list_get_first(node_list);
+         item != opal_list_get_end(node_list);
+         item = opal_list_get_next(item)) {
         node = (orte_node_t*)item;
         /* bozo check */
         if (NULL == node->topology) {
@@ -408,21 +440,18 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
                            true, node->name);
             return ORTE_ERR_SILENT;
         }
-        /* add this node to the map */
-        if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
-            ORTE_ERROR_LOG(idx);
-            return idx;
+        /* add this node to the map, if reqd */
+        if (!node->mapped) {
+            if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
+                ORTE_ERROR_LOG(idx);
+                return idx;
+            }
+            node->mapped = true;
+            OBJ_RETAIN(node);  /* maintain accounting on object */
+            ++(jdata->map->num_nodes);
         }
-        OBJ_RETAIN(node);  /* maintain accounting on object */
-        ++(jdata->map->num_nodes);

-        if (oversubscribed) {
-            /* flag the node as oversubscribed so that sched-yield gets
-             * properly set
-             */
-            node->oversubscribed = true;
-        }
-        /* compute the number of procs to go on this node */
+        /* compute the number of procs to go on this node */
         if (add_one) {
             if (0 == nxtra_nodes) {
                 --extra_procs_to_assign;
@@ -431,7 +460,7 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
                 --nxtra_nodes;
             }
         }
-        if (0 == (node->slots_alloc - node->slots_inuse)) {
+        if (node->slots_alloc == node->slots_inuse) {
             /* everybody takes at least the extras */
             num_procs_to_assign = extra_procs_to_assign;
         } else {
@@ -473,11 +502,16 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
                 nprocs_mapped++;
                 proc->locale = obj;
             }
-            /* keep track of the node we last used */
-            jdata->bookmark = node;
         }
-        /* maintain acctg */
-        OBJ_RELEASE(node);
+        /* not all nodes are equal, so only set oversubscribed for
+         * this node if it is in that state
+         */
+        if (node->slots_alloc < (int)node->num_procs) {
+            /* flag the node as oversubscribed so that sched-yield gets
+             * properly set
+             */
+            node->oversubscribed = true;
+        }
         if (nprocs_mapped == app->num_procs) {
             /* we are done */
             break;
@@ -516,7 +550,7 @@ static int byobj_span(orte_job_t *jdata,
      * do more because we don't know how many total objects exist
      * across all the nodes
      */
-    if (num_slots < app->num_procs) {
+    if (num_slots < (int)(jdata->num_procs + app->num_procs)) {
         if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
             orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                            true, app->num_procs, app->app);
@@ -542,11 +576,11 @@ static int byobj_span(orte_job_t *jdata,


     /* compute how many extra procs to put on each node */
-    balance = (float)(app->num_procs - (navg * opal_list_get_size(node_list))) / (float)opal_list_get_size(node_list);
+    balance = (float)((jdata->num_procs + app->num_procs) - (navg * opal_list_get_size(node_list))) / (float)opal_list_get_size(node_list);
     extra_procs_to_assign = (int)balance;
     if (0 < (balance - (float)extra_procs_to_assign)) {
         /* compute how many nodes need an extra proc */
-        nxtra_nodes = app->num_procs - ((navg + extra_procs_to_assign) * opal_list_get_size(node_list));
+        nxtra_nodes = (jdata->num_procs + app->num_procs) - ((navg + extra_procs_to_assign) * opal_list_get_size(node_list));
         /* add one so that we add an extra proc to the first nodes
          * until all procs are mapped
          */
@@ -562,7 +596,9 @@ static int byobj_span(orte_job_t *jdata,

     nprocs_mapped = 0;
     lag = 0;
-    while (NULL != (item = opal_list_remove_first(node_list))) {
+    for (item = opal_list_get_first(node_list);
+         item != opal_list_get_end(node_list);
+         item = opal_list_get_next(item)) {
         node = (orte_node_t*)item;
         /* bozo check */
         if (NULL == node->topology) {
@@ -570,13 +606,16 @@ static int byobj_span(orte_job_t *jdata,
                            true, node->name);
             return ORTE_ERR_SILENT;
         }
-        /* add this node to the map */
-        if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
-            ORTE_ERROR_LOG(idx);
-            return idx;
+        /* add this node to the map, if reqd */
+        if (!node->mapped) {
+            if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
+                ORTE_ERROR_LOG(idx);
+                return idx;
+            }
+            node->mapped = true;
+            OBJ_RETAIN(node);  /* maintain accounting on object */
+            ++(jdata->map->num_nodes);
         }
-        OBJ_RETAIN(node);  /* maintain accounting on object */
-        ++(jdata->map->num_nodes);
         /* compute the number of procs to go on this node */
         if (add_one) {
             if (0 == nxtra_nodes) {
@@ -671,8 +710,15 @@ static int byobj_span(orte_job_t *jdata,
             /* keep track of the node we last used */
             jdata->bookmark = node;
         }
-        /* maintain acctg */
-        OBJ_RELEASE(node);
+        /* not all nodes are equal, so only set oversubscribed for
+         * this node if it is in that state
+         */
+        if (node->slots_alloc < (int)node->num_procs) {
+            /* flag the node as oversubscribed so that sched-yield gets
+             * properly set
+             */
+            node->oversubscribed = true;
+        }
         if (nprocs_mapped == app->num_procs) {
             /* we are done */
             break;
@@ -104,7 +104,6 @@
  */
 static orte_job_t *jdata=NULL;
 static char **global_mca_env = NULL;
-static bool have_zero_np = false;
 static orte_std_cntr_t total_num_apps = 0;
 static bool want_prefix_by_default = (bool) ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT;
 static char *ompi_server=NULL;
@@ -1646,31 +1645,6 @@ static int create_app(int argc, char* argv[], orte_app_context_t **app_ptr,

     app->num_procs = (orte_std_cntr_t)orterun_globals.num_procs;

-    /* If the user didn't specify the number of processes to run, then we
-       default to launching an app process using every slot. We can't do
-       anything about that here - we leave it to the RMAPS framework's
-       components to note this and deal with it later.
-
-       HOWEVER, we ONLY support this mode of operation if the number of
-       app_contexts is equal to ONE. If the user provides multiple applications,
-       we simply must have more information - in this case, generate an
-       error.
-    */
-    if (app->num_procs == 0) {
-        have_zero_np = true;  /** flag that we have a zero_np situation */
-    }
-
-    if (0 < total_num_apps && have_zero_np) {
-        /** we have more than one app and a zero_np - that's no good.
-         * note that we have to do this as a two step logic check since
-         * the user may fail to specify num_procs for the first app, but
-         * then give us another application.
-         */
-        orte_show_help("help-orterun.txt", "orterun:multi-apps-and-zero-np",
-                       true, orte_basename, NULL);
-        return ORTE_ERR_FATAL;
-    }
-
     total_num_apps++;

     /* Preserve if we are to preload the binary */