Restore the prior default rank assignment scheme for the round-robin mappers. Ensure that the processes within each app_context are assigned sequential (contiguous) vpids.
This commit was SVN r22048.
This commit is contained in:
parent
c8c3132605
commit
dcab61ad83
@ -128,8 +128,10 @@ int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
|
||||
int rc=ORTE_SUCCESS;
|
||||
int i;
|
||||
orte_node_t *node;
|
||||
orte_proc_t *proc;
|
||||
opal_list_item_t *next;
|
||||
orte_vpid_t num_alloc = 0;
|
||||
orte_vpid_t start;
|
||||
int num_procs_to_assign, num_possible_procs;
|
||||
|
||||
/* This loop continues until all procs have been mapped or we run
|
||||
@ -140,6 +142,8 @@ int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
|
||||
If we still have processes that haven't been mapped yet, then it's an
|
||||
"out of resources" error. */
|
||||
|
||||
start = jdata->num_procs;
|
||||
|
||||
while ( num_alloc < num_procs) {
|
||||
/** see if any nodes remain unused and available. We need to do this check
|
||||
* each time since we may remove nodes from the list (as they become fully
|
||||
@ -211,10 +215,11 @@ int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
|
||||
}
|
||||
|
||||
for( i = 0; i < num_procs_to_assign; ++i) {
|
||||
proc = NULL;
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
|
||||
jdata->map->cpus_per_rank, app->idx,
|
||||
node_list, jdata->map->oversubscribe,
|
||||
true, NULL))) {
|
||||
true, &proc))) {
|
||||
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
|
||||
* really isn't an error - we just need to break from the loop
|
||||
* since the node is fully used up. For now, just don't report
|
||||
@ -226,6 +231,9 @@ int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
|
||||
}
|
||||
}
|
||||
|
||||
/* assign the vpid */
|
||||
proc->name.vpid = start++;
|
||||
|
||||
/* Update the number of procs allocated */
|
||||
++num_alloc;
|
||||
|
||||
@ -265,7 +273,9 @@ int orte_rmaps_base_map_bynode(orte_job_t *jdata, orte_app_context_t *app,
|
||||
int rc = ORTE_SUCCESS;
|
||||
opal_list_item_t *next;
|
||||
orte_node_t *node;
|
||||
orte_proc_t *proc;
|
||||
orte_vpid_t num_alloc=0;
|
||||
orte_vpid_t start;
|
||||
|
||||
/* This loop continues until all procs have been mapped or we run
|
||||
out of resources. We determine that we have "run out of
|
||||
@ -284,6 +294,8 @@ int orte_rmaps_base_map_bynode(orte_job_t *jdata, orte_app_context_t *app,
|
||||
list, oversubscription is automatically taken care of via this logic.
|
||||
*/
|
||||
|
||||
start = jdata->num_procs;
|
||||
|
||||
while (num_alloc < num_procs) {
|
||||
/** see if any nodes remain unused and available. We need to do this check
|
||||
* each time since we may remove nodes from the list (as they become fully
|
||||
@ -307,8 +319,9 @@ int orte_rmaps_base_map_bynode(orte_job_t *jdata, orte_app_context_t *app,
|
||||
|
||||
/* Allocate a slot on this node */
|
||||
node = (orte_node_t*) cur_node_item;
|
||||
proc = NULL;
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, jdata->map->cpus_per_rank, app->idx,
|
||||
node_list, jdata->map->oversubscribe, true, NULL))) {
|
||||
node_list, jdata->map->oversubscribe, true, &proc))) {
|
||||
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
|
||||
* really isn't an error - we just need to break from the loop
|
||||
* since the node is fully used up. For now, just don't report
|
||||
@ -320,6 +333,10 @@ int orte_rmaps_base_map_bynode(orte_job_t *jdata, orte_app_context_t *app,
|
||||
}
|
||||
}
|
||||
|
||||
/* assign the vpid */
|
||||
proc->name.vpid = start++;
|
||||
|
||||
/* Update the number of procs allocated */
|
||||
++num_alloc;
|
||||
|
||||
cur_node_item = next;
|
||||
|
@ -387,7 +387,7 @@ int orte_rmaps_base_claim_slot(orte_job_t *jdata,
|
||||
int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
|
||||
{
|
||||
orte_job_map_t *map;
|
||||
orte_vpid_t vpid, vpid_start=0;
|
||||
orte_vpid_t vpid, vpid_start;
|
||||
int i, j;
|
||||
orte_node_t *node;
|
||||
orte_proc_t *proc;
|
||||
@ -395,28 +395,31 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
|
||||
|
||||
map = jdata->map;
|
||||
|
||||
if (ORTE_MAPPING_BYUSER & map->policy) {
|
||||
/* find the max vpid already assigned */
|
||||
vpid_start = ORTE_VPID_MIN;
|
||||
for (i=0; i < map->nodes->size; i++) {
|
||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
|
||||
/* find the max vpid already assigned */
|
||||
vpid_start = ORTE_VPID_MIN;
|
||||
for (i=0; i < map->nodes->size; i++) {
|
||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
|
||||
continue;
|
||||
}
|
||||
for (j=0; j < node->procs->size; j++) {
|
||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
|
||||
continue;
|
||||
}
|
||||
for (j=0; j < node->procs->size; j++) {
|
||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
|
||||
continue;
|
||||
}
|
||||
/* ignore procs from other jobs */
|
||||
if (proc->name.jobid != jdata->jobid) {
|
||||
continue;
|
||||
}
|
||||
/* if the vpid is already defined, then update start */
|
||||
if (ORTE_VPID_INVALID != proc->name.vpid &&
|
||||
vpid_start < proc->name.vpid) {
|
||||
vpid_start = proc->name.vpid;
|
||||
}
|
||||
/* ignore procs from other jobs */
|
||||
if (proc->name.jobid != jdata->jobid) {
|
||||
continue;
|
||||
}
|
||||
/* if the vpid is already defined, then update start */
|
||||
if (ORTE_VPID_INVALID != proc->name.vpid &&
|
||||
vpid_start < proc->name.vpid) {
|
||||
vpid_start = proc->name.vpid;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (ORTE_VPID_MIN == vpid_start) {
|
||||
/* start at zero */
|
||||
vpid_start = 0;
|
||||
} else {
|
||||
/* we start one higher than the max found */
|
||||
vpid_start++;
|
||||
}
|
||||
|
@ -68,12 +68,6 @@ static int switchyard(orte_job_t *jdata)
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* compute vpids and add proc objects to the job */
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* compute and save local ranks */
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -166,6 +160,14 @@ static int npernode(orte_job_t *jdata)
|
||||
"npernode", orte_rmaps_base.npernode);
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
/* compute vpids and add proc objects to the job - this has to be
|
||||
* done after each app_context is mapped in order to keep the
|
||||
* vpids contiguous within an app_context
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
jdata->num_procs = total_procs;
|
||||
|
||||
@ -253,6 +255,14 @@ static int nperboard(orte_job_t *jdata)
|
||||
"nperboard", orte_rmaps_base.nperboard);
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
/* compute vpids and add proc objects to the job - this has to be
|
||||
* done after each app_context is mapped in order to keep the
|
||||
* vpids contiguous within an app_context
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
jdata->num_procs = total_procs;
|
||||
|
||||
@ -345,6 +355,14 @@ static int npersocket(orte_job_t *jdata)
|
||||
"npersocket", orte_rmaps_base.npersocket);
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
/* compute vpids and add proc objects to the job - this has to be
|
||||
* done after each app_context is mapped in order to keep the
|
||||
* vpids contiguous within an app_context
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
jdata->num_procs = total_procs;
|
||||
|
||||
@ -480,6 +498,14 @@ static int loadbalance(orte_job_t *jdata)
|
||||
"number of nodes", num_nodes);
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
/* compute vpids and add proc objects to the job - this has to be
|
||||
* done after each app_context is mapped in order to keep the
|
||||
* vpids contiguous within an app_context
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
/* record the number of procs */
|
||||
jdata->num_procs = total_procs;
|
||||
|
@ -478,6 +478,15 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
||||
}
|
||||
|
||||
cleanup:
|
||||
/* compute vpids and add proc objects to the job - this has to be
|
||||
* done after each app_context is mapped in order to keep the
|
||||
* vpids contiguous within an app_context
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* track number of procs */
|
||||
jdata->num_procs += app->num_procs;
|
||||
|
||||
@ -490,12 +499,6 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
||||
OBJ_DESTRUCT(&node_list);
|
||||
}
|
||||
|
||||
/* compute vpids and add proc objects to the job */
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* compute and save local ranks */
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
|
@ -92,9 +92,6 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
|
||||
app->num_procs = num_slots;
|
||||
}
|
||||
|
||||
/* track the total number of processes we mapped */
|
||||
jdata->num_procs += app->num_procs;
|
||||
|
||||
/* Make assignments */
|
||||
if (jdata->map->policy & ORTE_MAPPING_BYNODE) {
|
||||
rc = orte_rmaps_base_map_bynode(jdata, app, &node_list,
|
||||
@ -108,6 +105,9 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* track the total number of processes we mapped */
|
||||
jdata->num_procs += app->num_procs;
|
||||
|
||||
/* cleanup the node list - it can differ from one app_context
|
||||
* to another, so we have to get it every time
|
||||
*/
|
||||
|
Loading…
Reference in new issue
Block a user