1
1

Fix the radix routed component.

This commit was SVN r25175.
Этот коммит содержится в:
Ralph Castain 2011-09-22 09:32:53 +00:00
родитель 82c93611e6
Коммит 8347385630
2 изменённых файла: 93 добавления и 17 удалений

Просмотреть файл

@ -273,7 +273,7 @@ static int update_route(orte_process_name_t *target,
ORTE_NAME_PRINT(route))); ORTE_NAME_PRINT(route)));
jfam->route.jobid = route->jobid; jfam->route.jobid = route->jobid;
jfam->route.vpid = route->vpid; jfam->route.vpid = route->vpid;
ORTE_EPOCH_SET(jfam->route.epoch,route->epoch); ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route));
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
} }
@ -287,7 +287,7 @@ static int update_route(orte_process_name_t *target,
jfam->job_family = jfamily; jfam->job_family = jfamily;
jfam->route.jobid = route->jobid; jfam->route.jobid = route->jobid;
jfam->route.vpid = route->vpid; jfam->route.vpid = route->vpid;
ORTE_EPOCH_SET(jfam->route.epoch,route->epoch); ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route));
opal_pointer_array_add(&orte_routed_jobfams, jfam); opal_pointer_array_add(&orte_routed_jobfams, jfam);
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
@ -309,9 +309,24 @@ static orte_process_name_t get_route(orte_process_name_t *target)
orte_routed_jobfam_t *jfam; orte_routed_jobfam_t *jfam;
uint16_t jfamily; uint16_t jfamily;
/* initialize */
daemon.jobid = ORTE_PROC_MY_DAEMON->jobid;
daemon.vpid = ORTE_PROC_MY_DAEMON->vpid;
ORTE_EPOCH_SET(daemon.epoch,ORTE_PROC_MY_DAEMON->epoch);
#if ORTE_ENABLE_EPOCH
if (target->jobid == ORTE_JOBID_INVALID || if (target->jobid == ORTE_JOBID_INVALID ||
target->vpid == ORTE_VPID_INVALID || target->vpid == ORTE_VPID_INVALID ||
0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { target->epoch == ORTE_EPOCH_INVALID) {
#else
if (target->jobid == ORTE_JOBID_INVALID ||
target->vpid == ORTE_VPID_INVALID) {
#endif
ret = ORTE_NAME_INVALID;
goto found;
}
if (0 > ORTE_EPOCH_CMP(target->epoch, orte_ess.proc_get_epoch(target))) {
ret = ORTE_NAME_INVALID; ret = ORTE_NAME_INVALID;
goto found; goto found;
} }
@ -328,6 +343,20 @@ static orte_process_name_t get_route(orte_process_name_t *target)
goto found; goto found;
} }
/* if I am a tool, the route is direct if target is in
* my own job family, and to the target's HNP if not
*/
if (ORTE_PROC_IS_TOOL) {
if (ORTE_JOB_FAMILY(target->jobid) == ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
ret = target;
goto found;
} else {
ORTE_HNP_NAME_FROM_JOB(&daemon, target->jobid);
ret = &daemon;
goto found;
}
}
/****** HNP AND DAEMONS ONLY ******/ /****** HNP AND DAEMONS ONLY ******/
/* if the job family is zero, then this is going to a local slave, /* if the job family is zero, then this is going to a local slave,
@ -734,6 +763,34 @@ static int route_lost(const orte_process_name_t *route)
{ {
opal_list_item_t *item; opal_list_item_t *item;
orte_routed_tree_t *child; orte_routed_tree_t *child;
orte_routed_jobfam_t *jfam;
uint16_t jfamily;
int i;
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
"%s route to %s lost",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(route)));
/* if the route is to a different job family and we are the HNP, look it up */
if ((ORTE_JOB_FAMILY(route->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) &&
ORTE_PROC_IS_HNP) {
jfamily = ORTE_JOB_FAMILY(route->jobid);
for (i=0; i < orte_routed_jobfams.size; i++) {
if (NULL == (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) {
continue;
}
if (jfam->job_family == jfamily) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
"%s routed_radix: route to %s lost",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOB_FAMILY_PRINT(route->jobid)));
opal_pointer_array_set_item(&orte_routed_jobfams, i, NULL);
OBJ_RELEASE(jfam);
break;
}
}
}
/* if we lose the connection to the lifeline and we are NOT already, /* if we lose the connection to the lifeline and we are NOT already,
* in finalize, tell the OOB to abort. * in finalize, tell the OOB to abort.
@ -772,6 +829,34 @@ static int route_lost(const orte_process_name_t *route)
static bool route_is_defined(const orte_process_name_t *target) static bool route_is_defined(const orte_process_name_t *target)
{ {
int i;
orte_routed_jobfam_t *jfam;
uint16_t jfamily;
/* if the route is to a different job family and we are the HNP, look it up */
if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
if (ORTE_PROC_IS_HNP) {
jfamily = ORTE_JOB_FAMILY(target->jobid);
for (i=0; i < orte_routed_jobfams.size; i++) {
if (NULL == (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) {
continue;
}
if (jfam->job_family == jfamily) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
"%s routed_radix: route to %s is defined",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOB_FAMILY_PRINT(target->jobid)));
return true;
}
}
return false;
}
/* if we are not the HNP, then the answer is always true as
* we send it via the HNP
*/
return true;
}
/* find out what daemon hosts this proc */ /* find out what daemon hosts this proc */
if (ORTE_VPID_INVALID == orte_ess.proc_get_daemon((orte_process_name_t*)target)) { if (ORTE_VPID_INVALID == orte_ess.proc_get_daemon((orte_process_name_t*)target)) {
return false; return false;

Просмотреть файл

@ -50,21 +50,12 @@ orte_routed_radix_component_t mca_routed_radix_component = {
static int orte_routed_radix_component_query(mca_base_module_t **module, int *priority) static int orte_routed_radix_component_query(mca_base_module_t **module, int *priority)
{ {
int tmp;
mca_base_component_t *c = &mca_routed_radix_component.super.base_version; mca_base_component_t *c = &mca_routed_radix_component.super.base_version;
mca_base_param_reg_int(c, NULL, mca_base_param_reg_int(c, NULL,
"Radix to be used for routed radix tree", "Radix to be used for routed radix tree",
false, false, -1, &tmp); false, false, 32, &mca_routed_radix_component.radix);
if (0 < tmp) { *priority = 65;
mca_routed_radix_component.radix = tmp;
*priority = 150;
*module = (mca_base_module_t *) &orte_routed_radix_module; *module = (mca_base_module_t *) &orte_routed_radix_module;
return ORTE_SUCCESS; return ORTE_SUCCESS;
}
/* if radix not provided, then we can't run */
*priority = 0;
*module = NULL;
return ORTE_ERROR;
} }