diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index 118813eb70..9932706a64 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -1383,14 +1383,6 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender, if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { ORTE_ERROR_LOG(rc); ORTE_ACTIVATE_JOB_STATE(jdatorted, ORTE_JOB_STATE_FAILED_TO_START); - } else if (NULL != orte_tree_launch_cmd) { - /* if a tree-launch is underway, send the cmd back */ - relay = OBJ_NEW(opal_buffer_t); - opal_dss.copy_payload(relay, orte_tree_launch_cmd); - orte_rml.send_buffer_nb(orte_mgmt_conduit, - sender, relay, - ORTE_RML_TAG_DAEMON, - orte_rml_send_callback, NULL); } } diff --git a/orte/mca/plm/rsh/plm_rsh_module.c b/orte/mca/plm/rsh/plm_rsh_module.c index 7b5ae93a1a..94fe84f5c4 100644 --- a/orte/mca/plm/rsh/plm_rsh_module.c +++ b/orte/mca/plm/rsh/plm_rsh_module.c @@ -88,6 +88,7 @@ #include "orte/mca/ess/base/base.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/grpcomm/base/base.h" +#include "orte/mca/oob/base/base.h" #include "orte/mca/rmaps/rmaps.h" #include "orte/mca/routed/routed.h" #include "orte/mca/rml/base/rml_contact.h" @@ -605,7 +606,6 @@ static int setup_launch(int *argcptr, char ***argvptr, (mca_plm_rsh_component.using_qrsh && mca_plm_rsh_component.daemonize_qrsh)) && ((!mca_plm_rsh_component.using_llspawn) || (mca_plm_rsh_component.using_llspawn && mca_plm_rsh_component.daemonize_llspawn))) { - opal_argv_append(&argc, &argv, "--daemonize"); } /* @@ -617,9 +617,20 @@ static int setup_launch(int *argcptr, char ***argvptr, proc_vpid_index); /* ensure that only the ssh plm is selected on the remote daemon */ - opal_argv_append_nosize(&argv, "-"OPAL_MCA_CMD_LINE_ID); - opal_argv_append_nosize(&argv, "plm"); - opal_argv_append_nosize(&argv, "rsh"); + opal_argv_append(&argc, &argv, "-"OPAL_MCA_CMD_LINE_ID); + opal_argv_append(&argc, &argv, "plm"); + opal_argv_append(&argc, &argv, "rsh"); + + /* if we are tree-spawning, tell our child daemons the + * uri of their parent (me) */ + if (!mca_plm_rsh_component.no_tree_spawn) { + opal_argv_append(&argc, &argv, "--tree-spawn"); + orte_oob_base_get_addr(¶m); + opal_argv_append(&argc, &argv, "-"OPAL_MCA_CMD_LINE_ID); + opal_argv_append(&argc, &argv, "orte_parent_uri"); + opal_argv_append(&argc, &argv, param); + free(param); + } /* unless told otherwise... */ if (mca_plm_rsh_component.pass_environ_mca_params) { @@ -795,11 +806,22 @@ static int remote_spawn(opal_buffer_t *launch) /* if we hit any errors, tell the HNP it was us */ target.vpid = ORTE_PROC_MY_NAME->vpid; - /* extract the prefix from the launch buffer */ - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(launch, &prefix, &n, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - goto cleanup; + if (NULL != launch) { + /* extract the prefix from the launch buffer */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(launch, &prefix, &n, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + } else { + /* check to see if enable-orterun-prefix-by-default was given - if + * this is being done by a singleton, then orterun will not be there + * to put the prefix in the app. So make sure we check to find it */ + if ((bool)ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT) { + prefix = strdup(opal_install_dirs.prefix); + } else { + prefix = NULL; + } } /* get the updated routing list */ diff --git a/orte/orted/orted_main.c b/orte/orted/orted_main.c index ff6291d4df..bb3bbec90a 100644 --- a/orte/orted/orted_main.c +++ b/orte/orted/orted_main.c @@ -120,7 +120,7 @@ static void rollup(int status, orte_process_name_t* sender, static opal_buffer_t *bucket, *mybucket = NULL; static int ncollected = 0; -static char *orte_parent_uri; +static char *orte_parent_uri = NULL; static struct { bool debug; @@ -187,6 +187,10 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = { &orted_globals.set_sid, OPAL_CMD_LINE_TYPE_BOOL, "Direct the orted to separate from the current session"}, + { NULL, '\0', "tree-spawn", "tree-spawn", 0, + &orted_globals.tree_spawn, OPAL_CMD_LINE_TYPE_BOOL, + "Tree-based spawn in progress" }, + { "tmpdir_base", '\0', NULL, "tmpdir", 1, NULL, OPAL_CMD_LINE_TYPE_STRING, "Set the root for the session directory tree" }, @@ -667,22 +671,19 @@ int orte_daemon(int argc, char *argv[]) MCA_BASE_VAR_SCOPE_CONSTANT, &orte_parent_uri); if (NULL != orte_parent_uri) { - orte_process_name_t parent; opal_value_t val; /* set the contact info into our local database */ - ret = orte_rml_base_parse_uris(orte_parent_uri, &parent, NULL); + ret = orte_rml_base_parse_uris(orte_parent_uri, ORTE_PROC_MY_PARENT, NULL); if (ORTE_SUCCESS != ret) { ORTE_ERROR_LOG(ret); - free (orte_parent_uri); - orte_parent_uri = NULL; goto DONE; } OBJ_CONSTRUCT(&val, opal_value_t); val.key = OPAL_PMIX_PROC_URI; val.type = OPAL_STRING; val.data.string = orte_parent_uri; - if (OPAL_SUCCESS != (ret = opal_pmix.store_local(&parent, &val))) { + if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_PARENT, &val))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&val); goto DONE; @@ -691,21 +692,22 @@ int orte_daemon(int argc, char *argv[]) val.data.string = NULL; OBJ_DESTRUCT(&val); - /* don't need this value anymore */ - free(orte_parent_uri); - orte_parent_uri = NULL; - /* tell the routed module that we have a path * back to the HNP */ - if (ORTE_SUCCESS != (ret = orte_routed.update_route(NULL, ORTE_PROC_MY_HNP, &parent))) { + if (ORTE_SUCCESS != (ret = orte_routed.update_route(NULL, ORTE_PROC_MY_HNP, ORTE_PROC_MY_PARENT))) { + ORTE_ERROR_LOG(ret); + goto DONE; + } + /* and a path to our parent */ + if (ORTE_SUCCESS != (ret = orte_routed.update_route(NULL, ORTE_PROC_MY_PARENT, ORTE_PROC_MY_PARENT))) { ORTE_ERROR_LOG(ret); goto DONE; } /* set the lifeline to point to our parent so that we * can handle the situation if that lifeline goes away */ - if (ORTE_SUCCESS != (ret = orte_routed.set_lifeline(NULL, &parent))) { + if (ORTE_SUCCESS != (ret = orte_routed.set_lifeline(NULL, ORTE_PROC_MY_PARENT))) { ORTE_ERROR_LOG(ret); goto DONE; } @@ -717,12 +719,15 @@ int orte_daemon(int argc, char *argv[]) */ if (!ORTE_PROC_IS_HNP) { orte_process_name_t target; - target.jobid = ORTE_PROC_MY_NAME->jobid; - if (orte_fwd_mpirun_port || orte_static_ports) { - /* setup the rollup callback */ - orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ORTED_CALLBACK, - ORTE_RML_PERSISTENT, rollup, NULL); + /* setup the rollup callback */ + orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ORTED_CALLBACK, + ORTE_RML_PERSISTENT, rollup, NULL); + + /* define the target jobid */ + target.jobid = ORTE_PROC_MY_NAME->jobid; + if (orte_fwd_mpirun_port || orte_static_ports || NULL != orte_parent_uri) { + /* we start by sending to ourselves */ target.vpid = ORTE_PROC_MY_NAME->vpid; /* since we will be waiting for any children to send us * their rollup info before sending to our parent, save @@ -789,7 +794,6 @@ int orte_daemon(int argc, char *argv[]) } OPAL_LIST_RELEASE(modex); } else { - opal_output(0, "VAL KEY: %s", (NULL == val->key) ? "NULL" : val->key); /* single value */ flag = 1; if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) { @@ -965,6 +969,8 @@ int orte_daemon(int argc, char *argv[]) i += 2; } } + /* now launch any child daemons of ours */ + orte_plm.remote_spawn(orte_tree_launch_cmd); } if (orte_debug_daemons_flag) { @@ -1053,8 +1059,6 @@ static void rollup(int status, orte_process_name_t* sender, int32_t i, flag, cnt; opal_value_t *kv; - /* xfer the contents of the rollup to our bucket */ - opal_dss.copy_payload(bucket, buffer); ncollected++; /* if the sender is ourselves, then we save that buffer @@ -1064,6 +1068,8 @@ static void rollup(int status, orte_process_name_t* sender, mybucket = OBJ_NEW(opal_buffer_t); opal_dss.copy_payload(mybucket, buffer); } else { + /* xfer the contents of the rollup to our bucket */ + opal_dss.copy_payload(bucket, buffer); /* the first entry in the bucket will be from our * direct child - harvest it for connection info */ cnt = 1;