Fix a bug in the new gpr match check function that caused some containers to be mismatched.
Add the logic to properly assign new cellids to hosts read in by the hostfile component, but do not enable it yet. Unfortunately, the code base appears to assume that cellid is always zero: when I turn on the cellid capability, the system hangs whenever the cellid is non-zero. I'll have to chase that problem down. For now, cellid assignment is disabled (compiled out with #if 0) in the hostfile component. This commit was SVN r5865.
Этот коммит содержится в:
родитель
ddc19805ab
Коммит
05a1982853
@ -68,7 +68,7 @@ bool orte_gpr_replica_check_itag_list(orte_gpr_replica_addr_mode_t addr_mode,
|
|||||||
found_one = false;
|
found_one = false;
|
||||||
for (i=0; i < num_itags_entry; i++) { /* for each container tag */
|
for (i=0; i < num_itags_entry; i++) { /* for each container tag */
|
||||||
match = false;
|
match = false;
|
||||||
for (j=0; j < num_itags_search; j++) { /* check all the search tags and see if it is present */
|
for (j=0; j < num_itags_search && !match; j++) { /* check all the search tags and see if it is present */
|
||||||
if (entry_itags[i] == itags[j]) { /* found a match */
|
if (entry_itags[i] == itags[j]) { /* found a match */
|
||||||
if (ORTE_SUCCESS != (rc = orte_bitmap_set_bit(&(orte_gpr_replica_globals.srch_itag), itags[j]))) {
|
if (ORTE_SUCCESS != (rc = orte_bitmap_set_bit(&(orte_gpr_replica_globals.srch_itag), itags[j]))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
@ -76,7 +76,7 @@ bool orte_gpr_replica_check_itag_list(orte_gpr_replica_addr_mode_t addr_mode,
|
|||||||
}
|
}
|
||||||
if (ORTE_GPR_REPLICA_OR & addr_mode) { /* only need one match */
|
if (ORTE_GPR_REPLICA_OR & addr_mode) { /* only need one match */
|
||||||
if (not_set) return false;
|
if (not_set) return false;
|
||||||
else return true;
|
else return true;
|
||||||
}
|
}
|
||||||
match = true;
|
match = true;
|
||||||
found_one = true;
|
found_one = true;
|
||||||
@ -96,13 +96,13 @@ bool orte_gpr_replica_check_itag_list(orte_gpr_replica_addr_mode_t addr_mode,
|
|||||||
* that we would have already returned in the OR case. So, first check the XOR
|
* that we would have already returned in the OR case. So, first check the XOR
|
||||||
* case
|
* case
|
||||||
*/
|
*/
|
||||||
if (ORTE_GPR_REPLICA_XOR && found_one) {
|
if ((ORTE_GPR_REPLICA_XOR & addr_mode) && found_one) {
|
||||||
if (not_set) return false;
|
if (not_set) return false;
|
||||||
else return true;
|
else return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Only thing we have left to check is AND */
|
/* Only thing we have left to check is AND */
|
||||||
/* check if all the search tags were found */
|
/* check if any search tag was not found */
|
||||||
for (i=0; i < num_itags_search; i++) {
|
for (i=0; i < num_itags_search; i++) {
|
||||||
if (1 != orte_bitmap_is_set_bit(&(orte_gpr_replica_globals.srch_itag), itags[i])) {
|
if (1 != orte_bitmap_is_set_bit(&(orte_gpr_replica_globals.srch_itag), itags[i])) {
|
||||||
/* this tag was NOT found - required to find them all */
|
/* this tag was NOT found - required to find them all */
|
||||||
@ -110,7 +110,6 @@ bool orte_gpr_replica_check_itag_list(orte_gpr_replica_addr_mode_t addr_mode,
|
|||||||
else return false;
|
else return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* okay, all the tags are there, so we now passed the AND test */
|
/* okay, all the tags are there, so we now passed the AND test */
|
||||||
if (not_set) return false;
|
if (not_set) return false;
|
||||||
else return true;
|
else return true;
|
||||||
|
@ -19,6 +19,8 @@
|
|||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
#include "include/orte_constants.h"
|
#include "include/orte_constants.h"
|
||||||
|
#include "util/output.h"
|
||||||
|
|
||||||
#include "mca/errmgr/errmgr.h"
|
#include "mca/errmgr/errmgr.h"
|
||||||
#include "mca/soh/soh_types.h"
|
#include "mca/soh/soh_types.h"
|
||||||
#include "mca/gpr/gpr.h"
|
#include "mca/gpr/gpr.h"
|
||||||
@ -236,7 +238,7 @@ int orte_ras_base_node_insert(ompi_list_t* nodes)
|
|||||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||||
}
|
}
|
||||||
|
|
||||||
value->addr_mode = ORTE_GPR_OVERWRITE;
|
value->addr_mode = ORTE_GPR_OVERWRITE | ORTE_GPR_TOKENS_AND;
|
||||||
value->segment = strdup(ORTE_NODE_SEGMENT);
|
value->segment = strdup(ORTE_NODE_SEGMENT);
|
||||||
value->cnt = 6;
|
value->cnt = 6;
|
||||||
value->keyvals = (orte_gpr_keyval_t**)malloc(value->cnt*sizeof(orte_gpr_keyval_t*));
|
value->keyvals = (orte_gpr_keyval_t**)malloc(value->cnt*sizeof(orte_gpr_keyval_t*));
|
||||||
@ -392,7 +394,7 @@ int orte_ras_base_node_assign(ompi_list_t* nodes, orte_jobid_t jobid)
|
|||||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||||
}
|
}
|
||||||
|
|
||||||
values[i]->addr_mode = ORTE_GPR_OVERWRITE;
|
values[i]->addr_mode = ORTE_GPR_OVERWRITE | ORTE_GPR_TOKENS_AND;
|
||||||
values[i]->segment = strdup(ORTE_NODE_SEGMENT);
|
values[i]->segment = strdup(ORTE_NODE_SEGMENT);
|
||||||
values[i]->cnt = 1;
|
values[i]->cnt = 1;
|
||||||
values[i]->keyvals = (orte_gpr_keyval_t**)malloc(sizeof(orte_gpr_keyval_t*));
|
values[i]->keyvals = (orte_gpr_keyval_t**)malloc(sizeof(orte_gpr_keyval_t*));
|
||||||
|
@ -26,6 +26,7 @@
|
|||||||
#include "util/sys_info.h"
|
#include "util/sys_info.h"
|
||||||
#include "mca/mca.h"
|
#include "mca/mca.h"
|
||||||
#include "mca/base/base.h"
|
#include "mca/base/base.h"
|
||||||
|
#include "mca/ns/ns.h"
|
||||||
#include "mca/ras/base/ras_base_node.h"
|
#include "mca/ras/base/ras_base_node.h"
|
||||||
#include "mca/errmgr/errmgr.h"
|
#include "mca/errmgr/errmgr.h"
|
||||||
#include "rds_hostfile.h"
|
#include "rds_hostfile.h"
|
||||||
@ -84,6 +85,14 @@ static int orte_rds_hostfile_parse_line(int token, ompi_list_t* existing, ompi_l
|
|||||||
node = OBJ_NEW(orte_ras_base_node_t);
|
node = OBJ_NEW(orte_ras_base_node_t);
|
||||||
node->node_name = strdup(node_name);
|
node->node_name = strdup(node_name);
|
||||||
node->node_slots = 1;
|
node->node_slots = 1;
|
||||||
|
/* get a new cellid for this node */
|
||||||
|
#if 0
|
||||||
|
if (ORTE_SUCCESS != (rc = orte_ns.create_cellid(&(node->node_cellid),
|
||||||
|
"UNKNOWN-SITE", node->node_name))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
update++;
|
update++;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -246,6 +255,7 @@ cleanup:
|
|||||||
while(NULL != (item = ompi_list_remove_first(&existing))) {
|
while(NULL != (item = ompi_list_remove_first(&existing))) {
|
||||||
OBJ_RELEASE(item);
|
OBJ_RELEASE(item);
|
||||||
}
|
}
|
||||||
|
|
||||||
while(NULL != (item = ompi_list_remove_first(&updates))) {
|
while(NULL != (item = ompi_list_remove_first(&updates))) {
|
||||||
OBJ_RELEASE(item);
|
OBJ_RELEASE(item);
|
||||||
}
|
}
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user