1
1

Restore the prior default rank assignment scheme for round-robin mappers. Ensure that each app_context has sequential vpids.

This commit was SVN r22048.
Этот коммит содержится в:
Ralph Castain 2009-10-02 03:16:18 +00:00
родитель c8c3132605
Коммит dcab61ad83
5 изменённых файлов: 85 добавлений и 36 удалений

Просмотреть файл

@ -128,8 +128,10 @@ int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
int rc=ORTE_SUCCESS;
int i;
orte_node_t *node;
orte_proc_t *proc;
opal_list_item_t *next;
orte_vpid_t num_alloc = 0;
orte_vpid_t start;
int num_procs_to_assign, num_possible_procs;
/* This loop continues until all procs have been mapped or we run
@ -140,6 +142,8 @@ int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
If we still have processes that haven't been mapped yet, then it's an
"out of resources" error. */
start = jdata->num_procs;
while ( num_alloc < num_procs) {
/** see if any nodes remain unused and available. We need to do this check
* each time since we may remove nodes from the list (as they become fully
@ -211,10 +215,11 @@ int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
}
for( i = 0; i < num_procs_to_assign; ++i) {
proc = NULL;
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
jdata->map->cpus_per_rank, app->idx,
node_list, jdata->map->oversubscribe,
true, NULL))) {
true, &proc))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
* really isn't an error - we just need to break from the loop
* since the node is fully used up. For now, just don't report
@ -226,6 +231,9 @@ int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
}
}
/* assign the vpid */
proc->name.vpid = start++;
/* Update the number of procs allocated */
++num_alloc;
@ -265,7 +273,9 @@ int orte_rmaps_base_map_bynode(orte_job_t *jdata, orte_app_context_t *app,
int rc = ORTE_SUCCESS;
opal_list_item_t *next;
orte_node_t *node;
orte_proc_t *proc;
orte_vpid_t num_alloc=0;
orte_vpid_t start;
/* This loop continues until all procs have been mapped or we run
out of resources. We determine that we have "run out of
@ -284,6 +294,8 @@ int orte_rmaps_base_map_bynode(orte_job_t *jdata, orte_app_context_t *app,
list, oversubscription is automatically taken care of via this logic.
*/
start = jdata->num_procs;
while (num_alloc < num_procs) {
/** see if any nodes remain unused and available. We need to do this check
* each time since we may remove nodes from the list (as they become fully
@ -307,8 +319,9 @@ int orte_rmaps_base_map_bynode(orte_job_t *jdata, orte_app_context_t *app,
/* Allocate a slot on this node */
node = (orte_node_t*) cur_node_item;
proc = NULL;
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, jdata->map->cpus_per_rank, app->idx,
node_list, jdata->map->oversubscribe, true, NULL))) {
node_list, jdata->map->oversubscribe, true, &proc))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
* really isn't an error - we just need to break from the loop
* since the node is fully used up. For now, just don't report
@ -320,6 +333,10 @@ int orte_rmaps_base_map_bynode(orte_job_t *jdata, orte_app_context_t *app,
}
}
/* assign the vpid */
proc->name.vpid = start++;
/* Update the number of procs allocated */
++num_alloc;
cur_node_item = next;

Просмотреть файл

@ -387,7 +387,7 @@ int orte_rmaps_base_claim_slot(orte_job_t *jdata,
int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
{
orte_job_map_t *map;
orte_vpid_t vpid, vpid_start=0;
orte_vpid_t vpid, vpid_start;
int i, j;
orte_node_t *node;
orte_proc_t *proc;
@ -395,28 +395,31 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
map = jdata->map;
if (ORTE_MAPPING_BYUSER & map->policy) {
/* find the max vpid already assigned */
vpid_start = ORTE_VPID_MIN;
for (i=0; i < map->nodes->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
/* find the max vpid already assigned */
vpid_start = ORTE_VPID_MIN;
for (i=0; i < map->nodes->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
continue;
}
for (j=0; j < node->procs->size; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
}
for (j=0; j < node->procs->size; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
}
/* ignore procs from other jobs */
if (proc->name.jobid != jdata->jobid) {
continue;
}
/* if the vpid is already defined, then update start */
if (ORTE_VPID_INVALID != proc->name.vpid &&
vpid_start < proc->name.vpid) {
vpid_start = proc->name.vpid;
}
/* ignore procs from other jobs */
if (proc->name.jobid != jdata->jobid) {
continue;
}
/* if the vpid is already defined, then update start */
if (ORTE_VPID_INVALID != proc->name.vpid &&
vpid_start < proc->name.vpid) {
vpid_start = proc->name.vpid;
}
}
}
if (ORTE_VPID_MIN == vpid_start) {
/* start at zero */
vpid_start = 0;
} else {
/* we start one higher than the max found */
vpid_start++;
}

Просмотреть файл

@ -68,12 +68,6 @@ static int switchyard(orte_job_t *jdata)
return rc;
}
/* compute vpids and add proc objects to the job */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* compute and save local ranks */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
ORTE_ERROR_LOG(rc);
@ -166,6 +160,14 @@ static int npernode(orte_job_t *jdata)
"npernode", orte_rmaps_base.npernode);
return ORTE_ERR_SILENT;
}
/* compute vpids and add proc objects to the job - this has to be
* done after each app_context is mapped in order to keep the
* vpids contiguous within an app_context
*/
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
jdata->num_procs = total_procs;
@ -253,6 +255,14 @@ static int nperboard(orte_job_t *jdata)
"nperboard", orte_rmaps_base.nperboard);
return ORTE_ERR_SILENT;
}
/* compute vpids and add proc objects to the job - this has to be
* done after each app_context is mapped in order to keep the
* vpids contiguous within an app_context
*/
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
jdata->num_procs = total_procs;
@ -345,6 +355,14 @@ static int npersocket(orte_job_t *jdata)
"npersocket", orte_rmaps_base.npersocket);
return ORTE_ERR_SILENT;
}
/* compute vpids and add proc objects to the job - this has to be
* done after each app_context is mapped in order to keep the
* vpids contiguous within an app_context
*/
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
jdata->num_procs = total_procs;
@ -480,6 +498,14 @@ static int loadbalance(orte_job_t *jdata)
"number of nodes", num_nodes);
return ORTE_ERR_SILENT;
}
/* compute vpids and add proc objects to the job - this has to be
* done after each app_context is mapped in order to keep the
* vpids contiguous within an app_context
*/
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* record the number of procs */
jdata->num_procs = total_procs;

Просмотреть файл

@ -478,6 +478,15 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
}
cleanup:
/* compute vpids and add proc objects to the job - this has to be
* done after each app_context is mapped in order to keep the
* vpids contiguous within an app_context
*/
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* track number of procs */
jdata->num_procs += app->num_procs;
@ -490,12 +499,6 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
OBJ_DESTRUCT(&node_list);
}
/* compute vpids and add proc objects to the job */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* compute and save local ranks */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
ORTE_ERROR_LOG(rc);

Просмотреть файл

@ -92,9 +92,6 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
app->num_procs = num_slots;
}
/* track the total number of processes we mapped */
jdata->num_procs += app->num_procs;
/* Make assignments */
if (jdata->map->policy & ORTE_MAPPING_BYNODE) {
rc = orte_rmaps_base_map_bynode(jdata, app, &node_list,
@ -108,6 +105,9 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
goto error;
}
/* track the total number of processes we mapped */
jdata->num_procs += app->num_procs;
/* cleanup the node list - it can differ from one app_context
* to another, so we have to get it every time
*/