1
1
Fix a few bugs in the mappers:

1. Ensure that a bynode mapping with no -np specified fills all available slots, exactly as byslot does; the only difference is that the ranks are assigned bynode instead of byslot.

2. Fix the --nolocal behavior so it works correctly in all cases. We still have to test the host's name using opal_ifislocal in the mapper, because the name returned by gethostname and stored in orte_process_info.nodename can be an FQDN while the hostfile contains a non-FQDN version (or vice versa).

3. Add missing --nolocal logic to the seq mapper

Oversubscribed mapping appeared to work correctly without any changes, so I could not reproduce my own bug report in that regard.

Also included are some preliminary changes to support the modified hostfile behavior, which will be committed shortly:

1. Removed the useless "allocate" field from the orte_node_t object: every node is automatically allocated for use, and everything ignored the field anyway.

2. Correctly initialize the slots_alloc field when the allocation is read.

This commit was SVN r19030.
Этот коммит содержится в:
Ralph Castain 2008-07-25 13:35:12 +00:00
родитель 31df89ccb2
Коммит a1d296ae03
11 изменённых файлов: 51 добавлений и 35 удалений

Просмотреть файл

@ -61,6 +61,12 @@ int orte_ras_base_allocate(orte_job_t *jdata)
"%s ras:base:allocate allocation already read",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* loop through the global node pool and set the
* number of allocated slots to the difference
* between slots and slots_in_use. Note that
* oversubscription will still allow procs to
* be mapped up to slots_max
*/
return ORTE_SUCCESS;
}

Просмотреть файл

@ -113,8 +113,6 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
/* use the local name for our node - don't trust what
* we got from an RM
*/
/* set the node to available for use */
hnp_node->allocate = true;
/* update the total slots in the job */
jdata->total_slots_alloc += hnp_node->slots;
/* don't keep duplicate copy */
@ -125,8 +123,9 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
"%s ras:base:node_insert node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == node->name) ? "NULL" : node->name));
/* set node to available for use */
node->allocate = true;
/* allocate all the available slots */
node->slots_alloc = node->slots;
/* insert it into the array */
node->index = opal_pointer_array_add(orte_node_pool, (void*)node);
if (ORTE_SUCCESS > (rc = node->index)) {
ORTE_ERROR_LOG(rc);

Просмотреть файл

@ -60,13 +60,11 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
if (NULL == nodes[i]) {
break; /* nodes are left aligned, so stop when we hit a null */
}
if (nodes[i]->allocate) {
/* retain a copy for our use in case the item gets
* destructed along the way
*/
OBJ_RETAIN(nodes[i]);
opal_list_append(allocated_nodes, &nodes[i]->super);
}
/* retain a copy for our use in case the item gets
* destructed along the way
*/
OBJ_RETAIN(nodes[i]);
opal_list_append(allocated_nodes, &nodes[i]->super);
}
/** check that anything is here */
@ -139,11 +137,12 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
item != opal_list_get_end(allocated_nodes);
item = opal_list_get_next(item) ) {
node = (orte_node_t*)item;
/* by this time, we have adjusted all local node
* names to be our node name, so we don't need
* to keep checking for that condition
/* need to check ifislocal because the name in the
* hostfile may not have been FQDN, while name returned
* by gethostname may have been (or vice versa)
*/
if (0 == strcmp(node->name, orte_process_info.nodename)) {
if (0 == strcmp(node->name, orte_process_info.nodename) ||
opal_ifislocal(node->name)) {
opal_list_remove_item(allocated_nodes, item);
OBJ_RELEASE(item); /* "un-retain" it */
break;

Просмотреть файл

@ -74,9 +74,10 @@ orte_rmaps_rank_file_component_t mca_rmaps_rank_file_component = {
*/
static int orte_rmaps_rank_file_open(void)
{
mca_rmaps_rank_file_component.priority = 0;
int index = 0;
mca_rmaps_rank_file_component.priority = 0;
mca_base_param_reg_string(&mca_rmaps_rank_file_component.super.base_version,
"path",
"The path to the rank mapping file",

Просмотреть файл

@ -312,9 +312,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
if (NULL == nodes[i]) {
break; /* nodes are left aligned, so stop when we hit a null */
}
if (nodes[i]->allocate) {
num_nodes++;
}
num_nodes++;
}
/* compute the balance */
res = ((float)ppn / num_nodes);
@ -449,14 +447,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
goto error;
}
} else if (0 == app->num_procs) {
/** set the num_procs to equal the number of slots on these mapped nodes - if
user has specified "-bynode", then set it to the number of nodes
*/
if (map->policy & ORTE_RMAPS_BYNODE) {
app->num_procs = num_nodes;
} else if (map->policy & ORTE_RMAPS_BYSLOT) {
app->num_procs = num_slots;
} else if (map->policy & ORTE_RMAPS_BYUSER) {
if (map->policy & ORTE_RMAPS_BYUSER) {
/* we can't handle this - it should have been set when we got
* the map info. If it wasn't, then we can only error out
*/
@ -465,6 +456,8 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
rc = ORTE_ERR_SILENT;
goto error;
}
/** set the num_procs to equal the number of slots on these mapped nodes */
app->num_procs = num_slots;
}
/** track the total number of processes we mapped */

Просмотреть файл

@ -32,6 +32,7 @@
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/trace.h"
#include "opal/util/argv.h"
#include "opal/util/if.h"
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"
@ -106,6 +107,26 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
} else {
node_list = default_node_list;
}
/* check for nolocal and remove the head node, if required */
if (map->policy & ORTE_RMAPS_NO_USE_LOCAL) {
for (item = opal_list_get_first(node_list);
item != opal_list_get_end(node_list);
item = opal_list_get_next(item) ) {
node = (orte_node_t*)item;
/* need to check ifislocal because the name in the
* hostfile may not have been FQDN, while name returned
* by gethostname may have been (or vice versa)
*/
if (0 == strcmp(node->name, orte_process_info.nodename) ||
opal_ifislocal(node->name)) {
opal_list_remove_item(node_list, item);
OBJ_RELEASE(item); /* "un-retain" it */
break;
}
}
}
if (NULL == node_list || 0 == (num_nodes = (orte_std_cntr_t)opal_list_get_size(node_list))) {
orte_show_help("help-orte-rmaps-base.txt",
"orte-rmaps-base:no-available-resources",
@ -174,7 +195,7 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
if (NULL == nodes[j]) {
break; /* nodes are left aligned, so stop when we hit a null */
}
if (nodes[j]->allocate && 0 == strcmp(nd->name, nodes[j]->name)) {
if (0 == strcmp(nd->name, nodes[j]->name)) {
node = nodes[j];
break;
}

Просмотреть файл

@ -309,7 +309,7 @@ int orte_dt_pack_node(opal_buffer_t *buffer, const void *src,
return rc;
}
/* do not pack the allocate flag, daemon name, or launch id */
/* do not pack the daemon name or launch id */
/* pack the number of procs on the node */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,

Просмотреть файл

@ -292,8 +292,8 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
return ORTE_SUCCESS;
}
asprintf(&tmp, "\n%sData for node: Name: %s\tAllocate: %s\n%s\tLaunch id: %ld\tArch: %0x\tState: %0x",
pfx2, src->name, (src->allocate) ? "Yes" : "No",
asprintf(&tmp, "\n%sData for node: Name: %s\t%s\tLaunch id: %ld\tArch: %0x\tState: %0x",
pfx2, src->name,
pfx2, (long)src->launch_id,
src->arch, src->state);

Просмотреть файл

@ -331,7 +331,7 @@ int orte_dt_unpack_node(opal_buffer_t *buffer, void *dest,
return rc;
}
/* do not unpack the allocate flag, daemon name, or launch id */
/* do not unpack the daemon name or launch id */
/* unpack the number of procs on the node */
n = 1;

Просмотреть файл

@ -127,8 +127,6 @@ typedef struct {
orte_std_cntr_t index;
/** String node name */
char *name;
/* whether or not this node is available for allocation */
bool allocate;
/* daemon on this node */
struct orte_proc_t *daemon;
/* whether or not this daemon has been launched */

Просмотреть файл

@ -180,7 +180,6 @@ OBJ_CLASS_INSTANCE(orte_job_t,
static void orte_node_construct(orte_node_t* node)
{
node->name = NULL;
node->allocate = false;
node->index = -1;
node->daemon = NULL;
node->daemon_launched = false;