1
1

Fix daemon collectives - the earlier change missed the one spot where orte_routed_tree_t items were required. Sigh. Change the routed components so that get_routing_tree fills the list of children with orte_routed_tree_t items.

This commit was SVN r25516.
Этот коммит содержится в:
Ralph Castain 2011-11-28 22:24:49 +00:00
родитель 0bd2bf9aae
Коммит 237c79b6d7
8 изменённых файлов: 49 добавлений и 43 удалений

@ -641,7 +641,6 @@ static void ssh_child(int argc, char **argv,
static int remote_spawn(opal_buffer_t *launch)
{
opal_list_item_t *item;
orte_vpid_t vpid;
int node_name_index1;
int proc_vpid_index;
char **argv = NULL;
@ -652,6 +651,7 @@ static int remote_spawn(opal_buffer_t *launch)
pid_t pid;
orte_std_cntr_t n;
opal_byte_object_t *bo;
orte_process_name_t target;
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:rsh: remote spawn called",
@ -708,16 +708,17 @@ static int remote_spawn(opal_buffer_t *launch)
goto cleanup;
}
target.jobid = ORTE_PROC_MY_NAME->jobid;
for (item = opal_list_get_first(&my_children);
item != opal_list_get_end(&my_children);
item = opal_list_get_next(item)) {
orte_namelist_t *child = (orte_namelist_t*)item;
vpid = child->name.vpid;
orte_routed_tree_t *child = (orte_routed_tree_t*)item;
target.vpid = child->vpid;
/* get the host where this daemon resides */
if (NULL == (hostname = orte_ess.proc_get_hostname(&child->name))) {
if (NULL == (hostname = orte_ess.proc_get_hostname(&target))) {
opal_output(0, "%s unable to get hostname for daemon %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_VPID_PRINT(vpid));
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_VPID_PRINT(child->vpid));
rc = ORTE_ERR_NOT_FOUND;
goto cleanup;
}
@ -741,7 +742,7 @@ static int remote_spawn(opal_buffer_t *launch)
hostname));
/* do the ssh launch - this will exit if it fails */
ssh_child(argc, argv, vpid, proc_vpid_index);
ssh_child(argc, argv, child->vpid, proc_vpid_index);
}
/* father */
@ -758,7 +759,7 @@ static int remote_spawn(opal_buffer_t *launch)
/* setup callback on sigchild - wait until setup above is complete
* as the callback can occur in the call to orte_wait_cb
*/
orte_wait_cb(pid, rsh_wait_daemon, (void*)&vpid);
orte_wait_cb(pid, rsh_wait_daemon, (void*)&child->vpid);
}
failed_launch = false;
@ -777,7 +778,7 @@ cleanup:
OBJ_CONSTRUCT(&buf, opal_buffer_t);
opal_dss.pack(&buf, &cnt, 1, ORTE_STD_CNTR);
opal_dss.pack(&buf, &flag, 1, OPAL_UINT8);
opal_dss.pack(&buf, &vpid, 1, ORTE_VPID);
opal_dss.pack(&buf, &target.vpid, 1, ORTE_VPID);
orte_rml.send_buffer(ORTE_PROC_MY_HNP, &buf, ORTE_RML_TAG_REPORT_REMOTE_LAUNCH, 0);
OBJ_DESTRUCT(&buf);
}

@ -1061,7 +1061,7 @@ static orte_vpid_t get_routing_tree(opal_list_t *children)
{
opal_list_item_t *item;
orte_routed_tree_t *child;
orte_namelist_t *nm;
orte_routed_tree_t *nm;
/* if I am anything other than a daemon or the HNP, this
* is a meaningless command as I am not allowed to route
@ -1078,10 +1078,10 @@ static orte_vpid_t get_routing_tree(opal_list_t *children)
item != opal_list_get_end(&my_children);
item = opal_list_get_next(item)) {
child = (orte_routed_tree_t*)item;
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = ORTE_PROC_MY_NAME->jobid;
nm->name.vpid = child->vpid;
opal_list_append(children, &nm->item);
nm = OBJ_NEW(orte_routed_tree_t);
nm->vpid = child->vpid;
opal_bitmap_copy(&nm->relatives, &child->relatives);
opal_list_append(children, &nm->super);
}
}

@ -834,7 +834,7 @@ static int update_routing_tree(orte_jobid_t jobid)
static orte_vpid_t get_routing_tree(opal_list_t *children)
{
orte_namelist_t *nm;
orte_routed_tree_t *nm;
int32_t i;
orte_job_t *jdata;
orte_proc_t *proc;
@ -874,10 +874,9 @@ static orte_vpid_t get_routing_tree(opal_list_t *children)
ORTE_NAME_PRINT(&(proc->name)),
proc->state));
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = proc->name.jobid;
nm->name.vpid = proc->name.vpid;
opal_list_append(children, &nm->item);
nm = OBJ_NEW(orte_routed_tree_t);
nm->vpid = proc->name.vpid;
opal_list_append(children, &nm->super);
}
else {
OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output,

@ -330,7 +330,7 @@ static int update_routing_tree(orte_jobid_t jobid)
static orte_vpid_t get_routing_tree(opal_list_t *children)
{
orte_vpid_t i;
orte_namelist_t *nm;
orte_routed_tree_t *nm;
if (!ORTE_PROC_IS_HNP) {
/* if I am not the HNP, there is nothing to do */
@ -341,10 +341,8 @@ static orte_vpid_t get_routing_tree(opal_list_t *children)
* daemons so I can relay messages to them
*/
for (i=0; i < orte_process_info.num_procs; i++) {
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = ORTE_PROC_MY_NAME->jobid;
nm->name.vpid = i;
opal_list_append(children, &nm->item);
nm = OBJ_NEW(orte_routed_tree_t);
nm->vpid = i;
}
return ORTE_VPID_INVALID;
}

@ -778,8 +778,9 @@ static int update_routing_tree(orte_jobid_t jobid)
static orte_vpid_t get_routing_tree(opal_list_t *children)
{
orte_namelist_t *nm;
orte_routed_tree_t *nm;
orte_vpid_t v;
/* if I am anything other than a daemon or the HNP, this
* is a meaningless command as I am not allowed to route
*/
@ -794,10 +795,14 @@ static orte_vpid_t get_routing_tree(opal_list_t *children)
if (NULL != children &&
ORTE_PROC_MY_NAME->vpid < orte_process_info.num_procs-1) {
/* my child is just the vpid+1 daemon */
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = ORTE_PROC_MY_NAME->jobid;
nm->name.vpid = ORTE_PROC_MY_NAME->vpid + 1;
opal_list_append(children, &nm->item);
nm = OBJ_NEW(orte_routed_tree_t);
nm->vpid = ORTE_PROC_MY_NAME->vpid + 1;
opal_bitmap_init(&nm->relatives, orte_process_info.num_procs);
/* my relatives are everyone above that point */
for (v=nm->vpid+1; v < orte_process_info.num_procs; v++) {
opal_bitmap_set_bit(&nm->relatives, v);
}
opal_list_append(children, &nm->super);
}
if (ORTE_PROC_IS_HNP) {

@ -989,7 +989,7 @@ static orte_vpid_t get_routing_tree(opal_list_t *children)
{
opal_list_item_t *item;
orte_routed_tree_t *child;
orte_namelist_t *nm;
orte_routed_tree_t *nm;
/* if I am anything other than a daemon or the HNP, this
* is a meaningless command as I am not allowed to route
@ -1006,10 +1006,10 @@ static orte_vpid_t get_routing_tree(opal_list_t *children)
item != opal_list_get_end(&my_children);
item = opal_list_get_next(item)) {
child = (orte_routed_tree_t*)item;
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = ORTE_PROC_MY_NAME->jobid;
nm->name.vpid = child->vpid;
opal_list_append(children, &nm->item);
nm = OBJ_NEW(orte_routed_tree_t);
nm->vpid = child->vpid;
opal_bitmap_copy(&nm->relatives, &child->relatives);
opal_list_append(children, &nm->super);
}
}
/* return my parent's vpid */

@ -208,7 +208,7 @@ typedef int (*orte_routed_module_update_routing_tree_fn_t)(orte_jobid_t jobid);
* in the routing tree, and returns the vpid of the parent. Only valid
* when called by a daemon or the HNP. Passing a NULL pointer will result
* in only the parent vpid being returned. The returned list will be filled
* with orte_namelist_t items.
* with orte_routed_tree_t items.
*/
typedef orte_vpid_t (*orte_routed_module_get_routing_tree_fn_t)(opal_list_t *children);

@ -94,7 +94,8 @@ static void send_relay(opal_buffer_t *buf)
{
opal_list_t recips;
opal_list_item_t *item;
orte_namelist_t *nm;
orte_routed_tree_t *nm;
orte_process_name_t target;
int ret;
OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
@ -115,23 +116,25 @@ static void send_relay(opal_buffer_t *buf)
}
/* send the message to each recipient on list, deconstructing it as we go */
target.jobid = ORTE_PROC_MY_NAME->jobid;
while (NULL != (item = opal_list_remove_first(&recips))) {
nm = (orte_namelist_t*)item;
ORTE_EPOCH_SET(nm->name.epoch,orte_ess.proc_get_epoch(&nm->name));
nm = (orte_routed_tree_t*)item;
target.vpid = nm->vpid;
if (!PROC_IS_RUNNING(&nm->name)) {
ORTE_EPOCH_SET(target.epoch,orte_ess.proc_get_epoch(&target));
if (!PROC_IS_RUNNING(&target)) {
continue;
}
ORTE_EPOCH_SET(nm->name.epoch,orte_ess.proc_get_epoch(&nm->name));
ORTE_EPOCH_SET(target.epoch,orte_ess.proc_get_epoch(&target));
OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
"%s orte:daemon:send_relay sending relay msg to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&nm->name)));
ORTE_NAME_PRINT(&target)));
if (ORTE_SUCCESS != (ret = orte_comm(&nm->name, buf, ORTE_RML_TAG_DAEMON,
if (ORTE_SUCCESS != (ret = orte_comm(&target, buf, ORTE_RML_TAG_DAEMON,
orte_daemon_cmd_processor))) {
ORTE_ERROR_LOG(ret);
goto CLEANUP;