Merge pull request #1250 from rhc54/topic/rf
Fix the default slot mapping in rank file mapper
Этот коммит содержится в:
Коммит
d9cd451a16
@ -196,10 +196,42 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
|
||||
rank = vpid_start + k;
|
||||
/* get the rankfile entry for this rank */
|
||||
if (NULL == (rfmap = (orte_rmaps_rank_file_map_t*)opal_pointer_array_get_item(&rankmap, rank))) {
|
||||
/* all ranks must be specified */
|
||||
orte_show_help("help-rmaps_rank_file.txt", "missing-rank", true, rank, orte_rankfile);
|
||||
rc = ORTE_ERR_SILENT;
|
||||
goto error;
|
||||
/* if we were give a default slot-list, then use it */
|
||||
if (NULL != opal_hwloc_base_slot_list) {
|
||||
slots = opal_hwloc_base_slot_list;
|
||||
/* take the next node off of the available list */
|
||||
node = NULL;
|
||||
OPAL_LIST_FOREACH(nd, &node_list, orte_node_t) {
|
||||
/* if adding one to this node would oversubscribe it, then try
|
||||
* the next one */
|
||||
if (nd->slots <= (int)nd->num_procs) {
|
||||
continue;
|
||||
}
|
||||
/* take this one */
|
||||
node = nd;
|
||||
break;
|
||||
}
|
||||
if (NULL == node) {
|
||||
/* all would be oversubscribed, so take the least loaded one */
|
||||
k = UINT32_MAX;
|
||||
OPAL_LIST_FOREACH(nd, &node_list, orte_node_t) {
|
||||
if (nd->num_procs < k) {
|
||||
k = nd->num_procs;
|
||||
node = nd;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* if we still have nothing, then something is very wrong */
|
||||
if (NULL == node) {
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto error;
|
||||
}
|
||||
} else {
|
||||
/* all ranks must be specified */
|
||||
orte_show_help("help-rmaps_rank_file.txt", "missing-rank", true, rank, orte_rankfile);
|
||||
rc = ORTE_ERR_SILENT;
|
||||
goto error;
|
||||
}
|
||||
} else {
|
||||
if (0 == strlen(rfmap->slot_list)) {
|
||||
/* rank was specified but no slot list given - that's an error */
|
||||
@ -208,34 +240,32 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
|
||||
goto error;
|
||||
}
|
||||
slots = rfmap->slot_list;
|
||||
}
|
||||
/* find the node where this proc was assigned */
|
||||
node = NULL;
|
||||
OPAL_LIST_FOREACH(nd, &node_list, orte_node_t) {
|
||||
if (NULL != rfmap->node_name &&
|
||||
0 == strcmp(nd->name, rfmap->node_name)) {
|
||||
node = nd;
|
||||
break;
|
||||
} else if (NULL != rfmap->node_name &&
|
||||
(('+' == rfmap->node_name[0]) &&
|
||||
(('n' == rfmap->node_name[1]) ||
|
||||
('N' == rfmap->node_name[1])))) {
|
||||
|
||||
/* find the node where this proc was assigned */
|
||||
node = NULL;
|
||||
OPAL_LIST_FOREACH(nd, &node_list, orte_node_t) {
|
||||
if (NULL != rfmap->node_name &&
|
||||
0 == strcmp(nd->name, rfmap->node_name)) {
|
||||
node = nd;
|
||||
break;
|
||||
} else if (NULL != rfmap->node_name &&
|
||||
(('+' == rfmap->node_name[0]) &&
|
||||
(('n' == rfmap->node_name[1]) ||
|
||||
('N' == rfmap->node_name[1])))) {
|
||||
|
||||
relative_index=atoi(strtok(rfmap->node_name,"+n"));
|
||||
if ( relative_index >= (int)opal_list_get_size (&node_list) || ( 0 > relative_index)){
|
||||
orte_show_help("help-rmaps_rank_file.txt","bad-index", true,rfmap->node_name);
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
relative_index=atoi(strtok(rfmap->node_name,"+n"));
|
||||
if ( relative_index >= (int)opal_list_get_size (&node_list) || ( 0 > relative_index)){
|
||||
orte_show_help("help-rmaps_rank_file.txt","bad-index", true,rfmap->node_name);
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
root_node = (orte_node_t*) opal_list_get_first(&node_list);
|
||||
for(tmp_cnt=0; tmp_cnt<relative_index; tmp_cnt++) {
|
||||
root_node = (orte_node_t*) opal_list_get_next(root_node);
|
||||
}
|
||||
node = root_node;
|
||||
break;
|
||||
}
|
||||
root_node = (orte_node_t*) opal_list_get_first(&node_list);
|
||||
for(tmp_cnt=0; tmp_cnt<relative_index; tmp_cnt++) {
|
||||
root_node = (orte_node_t*) opal_list_get_next(root_node);
|
||||
}
|
||||
node = root_node;
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
if (NULL == node) {
|
||||
orte_show_help("help-rmaps_rank_file.txt","bad-host", true, rfmap->node_name);
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user