More work with Ralph -- now we think we have it right. Here are the
additions from his previous commit: - Properly propagate the error upwards if we have a localhost+other_node error - Added logic to handle multiple instances of the same hostname - Added logic to properly increment the slot count for multiple instances. For example, a hostfile with the three lines "foo.example.com", "foo.example.com slots=4", and "foo.example.com slots=8" would result in a single host with a slot count of 13 (i.e., if no slot count is specified, 1 is assumed) - Revised the localhost logic a bit -- some cases are ok (e.g., specifying localhost multiple times is ok, as long as there are no other hosts) This commit was SVN r5886.
Этот коммит содержится в:
родитель
fa8889bafa
Коммит
843cd2dbac
@ -75,7 +75,8 @@ static int orte_rds_hostfile_parse_line(int token, ompi_list_t* existing, ompi_l
|
||||
{
|
||||
int rc;
|
||||
orte_ras_base_node_t* node;
|
||||
int update = 0;
|
||||
bool update = false;
|
||||
bool got_count = false;
|
||||
|
||||
if (ORTE_RDS_HOSTFILE_STRING == token) {
|
||||
char* node_name = orte_rds_hostfile_value.sval;
|
||||
@ -85,36 +86,72 @@ static int orte_rds_hostfile_parse_line(int token, ompi_list_t* existing, ompi_l
|
||||
* vaguery of the "nodename" parameter returned by Linux system calls.
|
||||
* See the man page for uname for a detailed explanation
|
||||
*/
|
||||
if (strcmp(node_name, "localhost") == 0) {
|
||||
if (0 < ompi_list_get_size(updates)) {
|
||||
if (0 == strcmp(node_name, "localhost")) {
|
||||
|
||||
/* If the size of the updates list == 1 and it only
|
||||
contains localhost, or if the size of the updates list
|
||||
== 0, we're ok. Otherwise, this is an error. The
|
||||
positive logic test was a little clearer than a
|
||||
negative logic check, so even though this results in
|
||||
potentially re-setting localhost_found=true multiple
|
||||
times (if "localhost" is included multiple times in the
|
||||
file), the code is clearer this way. */
|
||||
|
||||
if (0 == ompi_list_get_size(updates) ||
|
||||
(1 == ompi_list_get_size(updates) && localhost_found)) {
|
||||
localhost_found = true;
|
||||
} else {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
|
||||
return ORTE_ERR_VALUE_OUT_OF_BOUNDS;
|
||||
}
|
||||
localhost_found = true;
|
||||
} else if (localhost_found) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
|
||||
return ORTE_ERR_VALUE_OUT_OF_BOUNDS;
|
||||
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
|
||||
return ORTE_ERR_VALUE_OUT_OF_BOUNDS;
|
||||
}
|
||||
|
||||
/* Do we need to make a new node object? First check to see
|
||||
if it's in the existing list. */
|
||||
|
||||
if (NULL == (node = orte_rds_hostfile_lookup(existing, node_name))) {
|
||||
node = OBJ_NEW(orte_ras_base_node_t);
|
||||
node->node_name = strdup(node_name);
|
||||
node->node_slots = 1;
|
||||
/* get a new cellid for this node */
|
||||
|
||||
/* If it wasn't, see if it's already in the updates list */
|
||||
|
||||
if (NULL == (node = orte_rds_hostfile_lookup(updates,
|
||||
node_name))) {
|
||||
node = OBJ_NEW(orte_ras_base_node_t);
|
||||
node->node_name = strdup(node_name);
|
||||
node->node_slots = 0;
|
||||
|
||||
#if 0
|
||||
if (ORTE_SUCCESS != (rc = orte_ns.create_cellid(&(node->node_cellid),
|
||||
"UNKNOWN-SITE", node->node_name))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* get a new cellid for this node */
|
||||
/* JMS Temporarily turned off until cell IDs are
|
||||
properly handled elsewhere in the code */
|
||||
if (ORTE_SUCCESS !=
|
||||
(rc = orte_ns.create_cellid(&(node->node_cellid),
|
||||
"UNKNOWN-SITE",
|
||||
node->node_name))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
#endif
|
||||
update++;
|
||||
}
|
||||
|
||||
/* Note that we need to set update to true regardless of
|
||||
whether the node was found on the updates list or not.
|
||||
If it was found, we just removed it (in
|
||||
orte_rds_hostfile_lookup()), so the update puts it back
|
||||
(potentially after updating it, of course). If it was
|
||||
not found, then we have a new node instance that needs
|
||||
to be added to the updates list. */
|
||||
|
||||
update = true;
|
||||
}
|
||||
} else {
|
||||
orte_rds_hostfile_parse_error();
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
got_count = false;
|
||||
while (!orte_rds_hostfile_done) {
|
||||
token = orte_rds_hostfile_lex();
|
||||
switch (token) {
|
||||
@ -132,10 +169,10 @@ static int orte_rds_hostfile_parse_line(int token, ompi_list_t* existing, ompi_l
|
||||
OBJ_RELEASE(node);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
if (node->node_slots != (size_t)rc) {
|
||||
node->node_slots = rc;
|
||||
update++;
|
||||
}
|
||||
node->node_slots += rc;
|
||||
update = true;
|
||||
got_count = true;
|
||||
|
||||
/* Ensure that node_slots_max >= node_slots */
|
||||
if (node->node_slots_max != 0 && node->node_slots_max < node->node_slots) {
|
||||
node->node_slots_max = node->node_slots;
|
||||
@ -152,7 +189,7 @@ static int orte_rds_hostfile_parse_line(int token, ompi_list_t* existing, ompi_l
|
||||
if (((size_t) rc) >= node->node_slots) {
|
||||
if (node->node_slots_max != (size_t)rc) {
|
||||
node->node_slots_max = rc;
|
||||
update++;
|
||||
update = true;
|
||||
}
|
||||
} else {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
@ -169,7 +206,10 @@ static int orte_rds_hostfile_parse_line(int token, ompi_list_t* existing, ompi_l
|
||||
}
|
||||
|
||||
done:
|
||||
if(update) {
|
||||
if (update) {
|
||||
if (!got_count) {
|
||||
++node->node_slots;
|
||||
}
|
||||
ompi_list_append(updates, &node->super);
|
||||
} else {
|
||||
OBJ_RELEASE(node);
|
||||
@ -259,6 +299,8 @@ static int orte_rds_hostfile_query(void)
|
||||
ompi_output(0, "orte_rds_hostfile: could not open %s\n", mca_rds_hostfile_component.path);
|
||||
}
|
||||
goto cleanup;
|
||||
} else if (ORTE_SUCCESS != rc) {
|
||||
goto cleanup;
|
||||
}
|
||||
if(ompi_list_get_size(&updates)) {
|
||||
rc = orte_ras_base_node_insert(&updates);
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user