From 237c79b6d72d447aeb30263cce5d4483aad58b65 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 28 Nov 2011 22:24:49 +0000 Subject: [PATCH] Fix daemon collectives - missed the one spot where returning orte_routed_tree_t was required. Sigh. Change the routed components to return that type on the list of children when get_routing_tree is called. This commit was SVN r25516. --- orte/mca/plm/rsh/plm_rsh_module.c | 17 +++++++++-------- orte/mca/routed/binomial/routed_binomial.c | 10 +++++----- orte/mca/routed/cm/routed_cm.c | 9 ++++----- orte/mca/routed/direct/routed_direct.c | 10 +++++----- orte/mca/routed/linear/routed_linear.c | 17 +++++++++++------ orte/mca/routed/radix/routed_radix.c | 10 +++++----- orte/mca/routed/routed.h | 2 +- orte/orted/orted_comm.c | 19 +++++++++++-------- 8 files changed, 51 insertions(+), 43 deletions(-) diff --git a/orte/mca/plm/rsh/plm_rsh_module.c b/orte/mca/plm/rsh/plm_rsh_module.c index 3e93013285..8c81e8883c 100644 --- a/orte/mca/plm/rsh/plm_rsh_module.c +++ b/orte/mca/plm/rsh/plm_rsh_module.c @@ -641,7 +641,6 @@ static void ssh_child(int argc, char **argv, static int remote_spawn(opal_buffer_t *launch) { opal_list_item_t *item; - orte_vpid_t vpid; int node_name_index1; int proc_vpid_index; char **argv = NULL; @@ -652,6 +651,7 @@ static int remote_spawn(opal_buffer_t *launch) pid_t pid; orte_std_cntr_t n; opal_byte_object_t *bo; + orte_process_name_t target; OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:rsh: remote spawn called", @@ -708,16 +708,17 @@ static int remote_spawn(opal_buffer_t *launch) goto cleanup; } + target.jobid = ORTE_PROC_MY_NAME->jobid; for (item = opal_list_get_first(&my_children); item != opal_list_get_end(&my_children); item = opal_list_get_next(item)) { - orte_namelist_t *child = (orte_namelist_t*)item; - vpid = child->name.vpid; + orte_routed_tree_t *child = (orte_routed_tree_t*)item; + target.vpid = child->vpid; /* get the host where this daemon resides */ - if (NULL == (hostname =
orte_ess.proc_get_hostname(&child->name))) { + if (NULL == (hostname = orte_ess.proc_get_hostname(&target))) { opal_output(0, "%s unable to get hostname for daemon %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_VPID_PRINT(vpid)); + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_VPID_PRINT(child->vpid)); rc = ORTE_ERR_NOT_FOUND; goto cleanup; } @@ -741,7 +742,7 @@ static int remote_spawn(opal_buffer_t *launch) hostname)); /* do the ssh launch - this will exit if it fails */ - ssh_child(argc, argv, vpid, proc_vpid_index); + ssh_child(argc, argv, child->vpid, proc_vpid_index); } /* father */ @@ -758,7 +759,7 @@ static int remote_spawn(opal_buffer_t *launch) /* setup callback on sigchild - wait until setup above is complete * as the callback can occur in the call to orte_wait_cb */ - orte_wait_cb(pid, rsh_wait_daemon, (void*)&vpid); + orte_wait_cb(pid, rsh_wait_daemon, (void*)&child->vpid); } failed_launch = false; @@ -777,7 +778,7 @@ cleanup: OBJ_CONSTRUCT(&buf, opal_buffer_t); opal_dss.pack(&buf, &cnt, 1, ORTE_STD_CNTR); opal_dss.pack(&buf, &flag, 1, OPAL_UINT8); - opal_dss.pack(&buf, &vpid, 1, ORTE_VPID); + opal_dss.pack(&buf, &target.vpid, 1, ORTE_VPID); orte_rml.send_buffer(ORTE_PROC_MY_HNP, &buf, ORTE_RML_TAG_REPORT_REMOTE_LAUNCH, 0); OBJ_DESTRUCT(&buf); } diff --git a/orte/mca/routed/binomial/routed_binomial.c b/orte/mca/routed/binomial/routed_binomial.c index 39a4bd0571..97602de770 100644 --- a/orte/mca/routed/binomial/routed_binomial.c +++ b/orte/mca/routed/binomial/routed_binomial.c @@ -1061,7 +1061,7 @@ static orte_vpid_t get_routing_tree(opal_list_t *children) { opal_list_item_t *item; orte_routed_tree_t *child; - orte_namelist_t *nm; + orte_routed_tree_t *nm; /* if I am anything other than a daemon or the HNP, this * is a meaningless command as I am not allowed to route @@ -1078,10 +1078,10 @@ static orte_vpid_t get_routing_tree(opal_list_t *children) item != opal_list_get_end(&my_children); item = opal_list_get_next(item)) { child = (orte_routed_tree_t*)item;
- nm = OBJ_NEW(orte_namelist_t); - nm->name.jobid = ORTE_PROC_MY_NAME->jobid; - nm->name.vpid = child->vpid; - opal_list_append(children, &nm->item); + nm = OBJ_NEW(orte_routed_tree_t); + nm->vpid = child->vpid; + opal_bitmap_copy(&nm->relatives, &child->relatives); + opal_list_append(children, &nm->super); } } diff --git a/orte/mca/routed/cm/routed_cm.c b/orte/mca/routed/cm/routed_cm.c index 3ef7ec8b5b..9f6d7566cd 100644 --- a/orte/mca/routed/cm/routed_cm.c +++ b/orte/mca/routed/cm/routed_cm.c @@ -834,7 +834,7 @@ static int update_routing_tree(orte_jobid_t jobid) static orte_vpid_t get_routing_tree(opal_list_t *children) { - orte_namelist_t *nm; + orte_routed_tree_t *nm; int32_t i; orte_job_t *jdata; orte_proc_t *proc; @@ -874,10 +874,9 @@ static orte_vpid_t get_routing_tree(opal_list_t *children) ORTE_NAME_PRINT(&(proc->name)), proc->state)); - nm = OBJ_NEW(orte_namelist_t); - nm->name.jobid = proc->name.jobid; - nm->name.vpid = proc->name.vpid; - opal_list_append(children, &nm->item); + nm = OBJ_NEW(orte_routed_tree_t); + nm->vpid = proc->name.vpid; + opal_list_append(children, &nm->super); } else { OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, diff --git a/orte/mca/routed/direct/routed_direct.c b/orte/mca/routed/direct/routed_direct.c index ccc478c0fe..739072d994 100644 --- a/orte/mca/routed/direct/routed_direct.c +++ b/orte/mca/routed/direct/routed_direct.c @@ -330,7 +330,7 @@ static int update_routing_tree(orte_jobid_t jobid) static orte_vpid_t get_routing_tree(opal_list_t *children) { orte_vpid_t i; - orte_namelist_t *nm; + orte_routed_tree_t *nm; if (!ORTE_PROC_IS_HNP) { /* if I am not the HNP, there is nothing to do */ @@ -341,10 +341,10 @@ static orte_vpid_t get_routing_tree(opal_list_t *children) * daemons so I can relay messages to them */ for (i=0; i < orte_process_info.num_procs; i++) { - nm = OBJ_NEW(orte_namelist_t); - nm->name.jobid = ORTE_PROC_MY_NAME->jobid; - nm->name.vpid = i; - opal_list_append(children, &nm->item); + nm =
OBJ_NEW(orte_routed_tree_t); + nm->vpid = i; + /* hand the entry to the caller through the children list */ + opal_list_append(children, &nm->super); } return ORTE_VPID_INVALID; } diff --git a/orte/mca/routed/linear/routed_linear.c b/orte/mca/routed/linear/routed_linear.c index 80c1bb47b7..107a92d833 100644 --- a/orte/mca/routed/linear/routed_linear.c +++ b/orte/mca/routed/linear/routed_linear.c @@ -778,8 +778,9 @@ static int update_routing_tree(orte_jobid_t jobid) static orte_vpid_t get_routing_tree(opal_list_t *children) { - orte_namelist_t *nm; - + orte_routed_tree_t *nm; + orte_vpid_t v; + /* if I am anything other than a daemon or the HNP, this * is a meaningless command as I am not allowed to route */ @@ -794,10 +795,14 @@ static orte_vpid_t get_routing_tree(opal_list_t *children) if (NULL != children && ORTE_PROC_MY_NAME->vpid < orte_process_info.num_procs-1) { /* my child is just the vpid+1 daemon */ - nm = OBJ_NEW(orte_namelist_t); - nm->name.jobid = ORTE_PROC_MY_NAME->jobid; - nm->name.vpid = ORTE_PROC_MY_NAME->vpid + 1; - opal_list_append(children, &nm->item); + nm = OBJ_NEW(orte_routed_tree_t); + nm->vpid = ORTE_PROC_MY_NAME->vpid + 1; + opal_bitmap_init(&nm->relatives, orte_process_info.num_procs); + /* my relatives are everyone above that point */ + for (v=nm->vpid+1; v < orte_process_info.num_procs; v++) { + opal_bitmap_set_bit(&nm->relatives, v); + } + opal_list_append(children, &nm->super); } if (ORTE_PROC_IS_HNP) { diff --git a/orte/mca/routed/radix/routed_radix.c b/orte/mca/routed/radix/routed_radix.c index 20190d1a91..5ac00a923b 100644 --- a/orte/mca/routed/radix/routed_radix.c +++ b/orte/mca/routed/radix/routed_radix.c @@ -989,7 +989,7 @@ static orte_vpid_t get_routing_tree(opal_list_t *children) { opal_list_item_t *item; orte_routed_tree_t *child; - orte_namelist_t *nm; + orte_routed_tree_t *nm; /* if I am anything other than a daemon or the HNP, this * is a meaningless command as I am not allowed to route @@ -1006,10 +1006,10 @@ static orte_vpid_t get_routing_tree(opal_list_t *children) item != opal_list_get_end(&my_children); item =
opal_list_get_next(item)) { child = (orte_routed_tree_t*)item; - nm = OBJ_NEW(orte_namelist_t); - nm->name.jobid = ORTE_PROC_MY_NAME->jobid; - nm->name.vpid = child->vpid; - opal_list_append(children, &nm->item); + nm = OBJ_NEW(orte_routed_tree_t); + nm->vpid = child->vpid; + opal_bitmap_copy(&nm->relatives, &child->relatives); + opal_list_append(children, &nm->super); } } /* return my parent's vpid */ diff --git a/orte/mca/routed/routed.h b/orte/mca/routed/routed.h index b4f584a1c8..cba46798b4 100644 --- a/orte/mca/routed/routed.h +++ b/orte/mca/routed/routed.h @@ -208,7 +208,7 @@ typedef int (*orte_routed_module_update_routing_tree_fn_t)(orte_jobid_t jobid); * in the routing tree, and returns the vpid of the parent. Only valid * when called by a daemon or the HNP. Passing a NULL pointer will result * in only the parent vpid being returned. The returned list will be filled - * with orte_namelist_t items. + * with orte_routed_tree_t items. */ typedef orte_vpid_t (*orte_routed_module_get_routing_tree_fn_t)(opal_list_t *children); diff --git a/orte/orted/orted_comm.c b/orte/orted/orted_comm.c index dff9fbfdc5..e559f064a4 100644 --- a/orte/orted/orted_comm.c +++ b/orte/orted/orted_comm.c @@ -94,7 +94,8 @@ static void send_relay(opal_buffer_t *buf) { opal_list_t recips; opal_list_item_t *item; - orte_namelist_t *nm; + orte_routed_tree_t *nm; + orte_process_name_t target; int ret; OPAL_OUTPUT_VERBOSE((1, orte_debug_output, @@ -115,23 +116,25 @@ static void send_relay(opal_buffer_t *buf) } /* send the message to each recipient on list, deconstructing it as we go */ + target.jobid = ORTE_PROC_MY_NAME->jobid; while (NULL != (item = opal_list_remove_first(&recips))) { - nm = (orte_namelist_t*)item; - - ORTE_EPOCH_SET(nm->name.epoch,orte_ess.proc_get_epoch(&nm->name)); + nm = (orte_routed_tree_t*)item; + target.vpid = nm->vpid; - if (!PROC_IS_RUNNING(&nm->name)) { + ORTE_EPOCH_SET(target.epoch,orte_ess.proc_get_epoch(&target)); + + if (!PROC_IS_RUNNING(&target)) { continue; 
} - ORTE_EPOCH_SET(nm->name.epoch,orte_ess.proc_get_epoch(&nm->name)); + ORTE_EPOCH_SET(target.epoch,orte_ess.proc_get_epoch(&target)); OPAL_OUTPUT_VERBOSE((1, orte_debug_output, "%s orte:daemon:send_relay sending relay msg to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&nm->name))); + ORTE_NAME_PRINT(&target))); - if (ORTE_SUCCESS != (ret = orte_comm(&nm->name, buf, ORTE_RML_TAG_DAEMON, + if (ORTE_SUCCESS != (ret = orte_comm(&target, buf, ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor))) { ORTE_ERROR_LOG(ret); goto CLEANUP;