Fix daemon collectives - missed the one spot where returning orte_routed_tree_t was required. Sigh. Change the routed components so that the list of children filled in by get_routing_tree contains items of that type.
This commit was SVN r25516.
Этот коммит содержится в:
родитель
0bd2bf9aae
Коммит
237c79b6d7
orte
@ -641,7 +641,6 @@ static void ssh_child(int argc, char **argv,
|
||||
static int remote_spawn(opal_buffer_t *launch)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
orte_vpid_t vpid;
|
||||
int node_name_index1;
|
||||
int proc_vpid_index;
|
||||
char **argv = NULL;
|
||||
@ -652,6 +651,7 @@ static int remote_spawn(opal_buffer_t *launch)
|
||||
pid_t pid;
|
||||
orte_std_cntr_t n;
|
||||
opal_byte_object_t *bo;
|
||||
orte_process_name_t target;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:rsh: remote spawn called",
|
||||
@ -708,16 +708,17 @@ static int remote_spawn(opal_buffer_t *launch)
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
target.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
for (item = opal_list_get_first(&my_children);
|
||||
item != opal_list_get_end(&my_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
orte_namelist_t *child = (orte_namelist_t*)item;
|
||||
vpid = child->name.vpid;
|
||||
orte_routed_tree_t *child = (orte_routed_tree_t*)item;
|
||||
target.vpid = child->vpid;
|
||||
|
||||
/* get the host where this daemon resides */
|
||||
if (NULL == (hostname = orte_ess.proc_get_hostname(&child->name))) {
|
||||
if (NULL == (hostname = orte_ess.proc_get_hostname(&target))) {
|
||||
opal_output(0, "%s unable to get hostname for daemon %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_VPID_PRINT(vpid));
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_VPID_PRINT(child->vpid));
|
||||
rc = ORTE_ERR_NOT_FOUND;
|
||||
goto cleanup;
|
||||
}
|
||||
@ -741,7 +742,7 @@ static int remote_spawn(opal_buffer_t *launch)
|
||||
hostname));
|
||||
|
||||
/* do the ssh launch - this will exit if it fails */
|
||||
ssh_child(argc, argv, vpid, proc_vpid_index);
|
||||
ssh_child(argc, argv, child->vpid, proc_vpid_index);
|
||||
|
||||
}
|
||||
/* father */
|
||||
@ -758,7 +759,7 @@ static int remote_spawn(opal_buffer_t *launch)
|
||||
/* setup callback on sigchild - wait until setup above is complete
|
||||
* as the callback can occur in the call to orte_wait_cb
|
||||
*/
|
||||
orte_wait_cb(pid, rsh_wait_daemon, (void*)&vpid);
|
||||
orte_wait_cb(pid, rsh_wait_daemon, (void*)&child->vpid);
|
||||
}
|
||||
|
||||
failed_launch = false;
|
||||
@ -777,7 +778,7 @@ cleanup:
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||
opal_dss.pack(&buf, &cnt, 1, ORTE_STD_CNTR);
|
||||
opal_dss.pack(&buf, &flag, 1, OPAL_UINT8);
|
||||
opal_dss.pack(&buf, &vpid, 1, ORTE_VPID);
|
||||
opal_dss.pack(&buf, &target.vpid, 1, ORTE_VPID);
|
||||
orte_rml.send_buffer(ORTE_PROC_MY_HNP, &buf, ORTE_RML_TAG_REPORT_REMOTE_LAUNCH, 0);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
}
|
||||
|
@ -1061,7 +1061,7 @@ static orte_vpid_t get_routing_tree(opal_list_t *children)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
orte_routed_tree_t *child;
|
||||
orte_namelist_t *nm;
|
||||
orte_routed_tree_t *nm;
|
||||
|
||||
/* if I am anything other than a daemon or the HNP, this
|
||||
* is a meaningless command as I am not allowed to route
|
||||
@ -1078,10 +1078,10 @@ static orte_vpid_t get_routing_tree(opal_list_t *children)
|
||||
item != opal_list_get_end(&my_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_routed_tree_t*)item;
|
||||
nm = OBJ_NEW(orte_namelist_t);
|
||||
nm->name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
nm->name.vpid = child->vpid;
|
||||
opal_list_append(children, &nm->item);
|
||||
nm = OBJ_NEW(orte_routed_tree_t);
|
||||
nm->vpid = child->vpid;
|
||||
opal_bitmap_copy(&nm->relatives, &child->relatives);
|
||||
opal_list_append(children, &nm->super);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -834,7 +834,7 @@ static int update_routing_tree(orte_jobid_t jobid)
|
||||
|
||||
static orte_vpid_t get_routing_tree(opal_list_t *children)
|
||||
{
|
||||
orte_namelist_t *nm;
|
||||
orte_routed_tree_t *nm;
|
||||
int32_t i;
|
||||
orte_job_t *jdata;
|
||||
orte_proc_t *proc;
|
||||
@ -874,10 +874,9 @@ static orte_vpid_t get_routing_tree(opal_list_t *children)
|
||||
ORTE_NAME_PRINT(&(proc->name)),
|
||||
proc->state));
|
||||
|
||||
nm = OBJ_NEW(orte_namelist_t);
|
||||
nm->name.jobid = proc->name.jobid;
|
||||
nm->name.vpid = proc->name.vpid;
|
||||
opal_list_append(children, &nm->item);
|
||||
nm = OBJ_NEW(orte_routed_tree_t);
|
||||
nm->vpid = proc->name.vpid;
|
||||
opal_list_append(children, &nm->super);
|
||||
}
|
||||
else {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output,
|
||||
|
@ -330,7 +330,7 @@ static int update_routing_tree(orte_jobid_t jobid)
|
||||
static orte_vpid_t get_routing_tree(opal_list_t *children)
|
||||
{
|
||||
orte_vpid_t i;
|
||||
orte_namelist_t *nm;
|
||||
orte_routed_tree_t *nm;
|
||||
|
||||
if (!ORTE_PROC_IS_HNP) {
|
||||
/* if I am not the HNP, there is nothing to do */
|
||||
@ -341,10 +341,8 @@ static orte_vpid_t get_routing_tree(opal_list_t *children)
|
||||
* daemons so I can relay messages to them
|
||||
*/
|
||||
for (i=0; i < orte_process_info.num_procs; i++) {
|
||||
nm = OBJ_NEW(orte_namelist_t);
|
||||
nm->name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
nm->name.vpid = i;
|
||||
opal_list_append(children, &nm->item);
|
||||
nm = OBJ_NEW(orte_routed_tree_t);
|
||||
nm->vpid = i;
|
||||
}
|
||||
return ORTE_VPID_INVALID;
|
||||
}
|
||||
|
@ -778,8 +778,9 @@ static int update_routing_tree(orte_jobid_t jobid)
|
||||
|
||||
static orte_vpid_t get_routing_tree(opal_list_t *children)
|
||||
{
|
||||
orte_namelist_t *nm;
|
||||
|
||||
orte_routed_tree_t *nm;
|
||||
orte_vpid_t v;
|
||||
|
||||
/* if I am anything other than a daemon or the HNP, this
|
||||
* is a meaningless command as I am not allowed to route
|
||||
*/
|
||||
@ -794,10 +795,14 @@ static orte_vpid_t get_routing_tree(opal_list_t *children)
|
||||
if (NULL != children &&
|
||||
ORTE_PROC_MY_NAME->vpid < orte_process_info.num_procs-1) {
|
||||
/* my child is just the vpid+1 daemon */
|
||||
nm = OBJ_NEW(orte_namelist_t);
|
||||
nm->name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
nm->name.vpid = ORTE_PROC_MY_NAME->vpid + 1;
|
||||
opal_list_append(children, &nm->item);
|
||||
nm = OBJ_NEW(orte_routed_tree_t);
|
||||
nm->vpid = ORTE_PROC_MY_NAME->vpid + 1;
|
||||
opal_bitmap_init(&nm->relatives, orte_process_info.num_procs);
|
||||
/* my relatives are everyone above that point */
|
||||
for (v=nm->vpid+1; v < orte_process_info.num_procs; v++) {
|
||||
opal_bitmap_set_bit(&nm->relatives, v);
|
||||
}
|
||||
opal_list_append(children, &nm->super);
|
||||
}
|
||||
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
|
@ -989,7 +989,7 @@ static orte_vpid_t get_routing_tree(opal_list_t *children)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
orte_routed_tree_t *child;
|
||||
orte_namelist_t *nm;
|
||||
orte_routed_tree_t *nm;
|
||||
|
||||
/* if I am anything other than a daemon or the HNP, this
|
||||
* is a meaningless command as I am not allowed to route
|
||||
@ -1006,10 +1006,10 @@ static orte_vpid_t get_routing_tree(opal_list_t *children)
|
||||
item != opal_list_get_end(&my_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_routed_tree_t*)item;
|
||||
nm = OBJ_NEW(orte_namelist_t);
|
||||
nm->name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
nm->name.vpid = child->vpid;
|
||||
opal_list_append(children, &nm->item);
|
||||
nm = OBJ_NEW(orte_routed_tree_t);
|
||||
nm->vpid = child->vpid;
|
||||
opal_bitmap_copy(&nm->relatives, &child->relatives);
|
||||
opal_list_append(children, &nm->super);
|
||||
}
|
||||
}
|
||||
/* return my parent's vpid */
|
||||
|
@ -208,7 +208,7 @@ typedef int (*orte_routed_module_update_routing_tree_fn_t)(orte_jobid_t jobid);
|
||||
* in the routing tree, and returns the vpid of the parent. Only valid
|
||||
* when called by a daemon or the HNP. Passing a NULL pointer will result
|
||||
* in only the parent vpid being returned. The returned list will be filled
|
||||
* with orte_namelist_t items.
|
||||
* with orte_routed_tree_t items.
|
||||
*/
|
||||
typedef orte_vpid_t (*orte_routed_module_get_routing_tree_fn_t)(opal_list_t *children);
|
||||
|
||||
|
@ -94,7 +94,8 @@ static void send_relay(opal_buffer_t *buf)
|
||||
{
|
||||
opal_list_t recips;
|
||||
opal_list_item_t *item;
|
||||
orte_namelist_t *nm;
|
||||
orte_routed_tree_t *nm;
|
||||
orte_process_name_t target;
|
||||
int ret;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
|
||||
@ -115,23 +116,25 @@ static void send_relay(opal_buffer_t *buf)
|
||||
}
|
||||
|
||||
/* send the message to each recipient on list, deconstructing it as we go */
|
||||
target.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
while (NULL != (item = opal_list_remove_first(&recips))) {
|
||||
nm = (orte_namelist_t*)item;
|
||||
|
||||
ORTE_EPOCH_SET(nm->name.epoch,orte_ess.proc_get_epoch(&nm->name));
|
||||
nm = (orte_routed_tree_t*)item;
|
||||
target.vpid = nm->vpid;
|
||||
|
||||
if (!PROC_IS_RUNNING(&nm->name)) {
|
||||
ORTE_EPOCH_SET(target.epoch,orte_ess.proc_get_epoch(&target));
|
||||
|
||||
if (!PROC_IS_RUNNING(&target)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
ORTE_EPOCH_SET(nm->name.epoch,orte_ess.proc_get_epoch(&nm->name));
|
||||
ORTE_EPOCH_SET(target.epoch,orte_ess.proc_get_epoch(&target));
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
|
||||
"%s orte:daemon:send_relay sending relay msg to %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&nm->name)));
|
||||
ORTE_NAME_PRINT(&target)));
|
||||
|
||||
if (ORTE_SUCCESS != (ret = orte_comm(&nm->name, buf, ORTE_RML_TAG_DAEMON,
|
||||
if (ORTE_SUCCESS != (ret = orte_comm(&target, buf, ORTE_RML_TAG_DAEMON,
|
||||
orte_daemon_cmd_processor))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
goto CLEANUP;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user