1
1

Fix a bug in the new GPR match-check function (orte_gpr_replica_check_itag_list) that caused some containers to be mismatched.

Add the logic to properly assign new cellids to hosts read in by the hostfile component. However, don't turn it on yet.

It seems that the code base has (unfortunately) assumed that cellid is always zero. When I turn on the cellid capability, the system "hangs" whenever the cellid is non-zero. I'll have to chase that problem down. For now, I've turned "off" the cellid assignment in the hostfile component.

This commit was SVN r5865.
Этот коммит содержится в:
Ralph Castain 2005-05-25 16:23:13 +00:00
родитель ddc19805ab
Коммит 05a1982853
3 изменённых файлов: 18 добавлений и 7 удалений

Просмотреть файл

@ -68,7 +68,7 @@ bool orte_gpr_replica_check_itag_list(orte_gpr_replica_addr_mode_t addr_mode,
found_one = false;
for (i=0; i < num_itags_entry; i++) { /* for each container tag */
match = false;
for (j=0; j < num_itags_search; j++) { /* check all the search tags and see if it is present */
for (j=0; j < num_itags_search && !match; j++) { /* check all the search tags and see if it is present */
if (entry_itags[i] == itags[j]) { /* found a match */
if (ORTE_SUCCESS != (rc = orte_bitmap_set_bit(&(orte_gpr_replica_globals.srch_itag), itags[j]))) {
ORTE_ERROR_LOG(rc);
@ -76,7 +76,7 @@ bool orte_gpr_replica_check_itag_list(orte_gpr_replica_addr_mode_t addr_mode,
}
if (ORTE_GPR_REPLICA_OR & addr_mode) { /* only need one match */
if (not_set) return false;
else return true;
else return true;
}
match = true;
found_one = true;
@ -96,13 +96,13 @@ bool orte_gpr_replica_check_itag_list(orte_gpr_replica_addr_mode_t addr_mode,
* that we would have already returned in the OR case. So, first check the XOR
* case
*/
if (ORTE_GPR_REPLICA_XOR && found_one) {
if ((ORTE_GPR_REPLICA_XOR & addr_mode) && found_one) {
if (not_set) return false;
else return true;
}
/* Only thing we have left to check is AND */
/* check if all the search tags were found */
/* check if any search tag was not found */
for (i=0; i < num_itags_search; i++) {
if (1 != orte_bitmap_is_set_bit(&(orte_gpr_replica_globals.srch_itag), itags[i])) {
/* this tag was NOT found - required to find them all */
@ -110,7 +110,6 @@ bool orte_gpr_replica_check_itag_list(orte_gpr_replica_addr_mode_t addr_mode,
else return false;
}
}
/* okay, all the tags are there, so we now passed the AND test */
if (not_set) return false;
else return true;

Просмотреть файл

@ -19,6 +19,8 @@
#include <string.h>
#include "include/orte_constants.h"
#include "util/output.h"
#include "mca/errmgr/errmgr.h"
#include "mca/soh/soh_types.h"
#include "mca/gpr/gpr.h"
@ -236,7 +238,7 @@ int orte_ras_base_node_insert(ompi_list_t* nodes)
return ORTE_ERR_OUT_OF_RESOURCE;
}
value->addr_mode = ORTE_GPR_OVERWRITE;
value->addr_mode = ORTE_GPR_OVERWRITE | ORTE_GPR_TOKENS_AND;
value->segment = strdup(ORTE_NODE_SEGMENT);
value->cnt = 6;
value->keyvals = (orte_gpr_keyval_t**)malloc(value->cnt*sizeof(orte_gpr_keyval_t*));
@ -392,7 +394,7 @@ int orte_ras_base_node_assign(ompi_list_t* nodes, orte_jobid_t jobid)
return ORTE_ERR_OUT_OF_RESOURCE;
}
values[i]->addr_mode = ORTE_GPR_OVERWRITE;
values[i]->addr_mode = ORTE_GPR_OVERWRITE | ORTE_GPR_TOKENS_AND;
values[i]->segment = strdup(ORTE_NODE_SEGMENT);
values[i]->cnt = 1;
values[i]->keyvals = (orte_gpr_keyval_t**)malloc(sizeof(orte_gpr_keyval_t*));

Просмотреть файл

@ -26,6 +26,7 @@
#include "util/sys_info.h"
#include "mca/mca.h"
#include "mca/base/base.h"
#include "mca/ns/ns.h"
#include "mca/ras/base/ras_base_node.h"
#include "mca/errmgr/errmgr.h"
#include "rds_hostfile.h"
@ -84,6 +85,14 @@ static int orte_rds_hostfile_parse_line(int token, ompi_list_t* existing, ompi_l
node = OBJ_NEW(orte_ras_base_node_t);
node->node_name = strdup(node_name);
node->node_slots = 1;
/* get a new cellid for this node */
#if 0
if (ORTE_SUCCESS != (rc = orte_ns.create_cellid(&(node->node_cellid),
"UNKNOWN-SITE", node->node_name))) {
ORTE_ERROR_LOG(rc);
return rc;
}
#endif
update++;
}
} else {
@ -246,6 +255,7 @@ cleanup:
while(NULL != (item = ompi_list_remove_first(&existing))) {
OBJ_RELEASE(item);
}
while(NULL != (item = ompi_list_remove_first(&updates))) {
OBJ_RELEASE(item);
}