Enable comm_spawn of slave processes, currently only active for the rsh, slurm, and tm environments. Establish support for local rsh environments in the plm/base so that rsh of local slaves can be done by any environment that supports it. Create new orte_rsh_agent param so users can specify rsh agent from outside of rsh plm, and sym link that to the old plm_rsh_agent and pls_rsh_agent options.
Modify the orte-bootproxy to pass prefix for the remote slave to support hetero/hybrid scenarios This commit was SVN r20492.
Этот коммит содержится в:
родитель
631d7d2a85
Коммит
f0af389910
@ -35,13 +35,27 @@
|
||||
|
||||
char* ompi_dpm_base_dyn_init (void)
|
||||
{
|
||||
char *envvarname=NULL, *port_name=NULL;
|
||||
char *envvarname=NULL, *port_name=NULL, *tmp, *ptr;
|
||||
|
||||
/* check for appropriate env variable */
|
||||
asprintf(&envvarname, "OMPI_PARENT_PORT");
|
||||
port_name = getenv(envvarname);
|
||||
tmp = getenv(envvarname);
|
||||
free (envvarname);
|
||||
|
||||
if (NULL != tmp) {
|
||||
/* the value passed to us may have quote marks around it to protect
|
||||
* the value if passed on the command line. We must remove those
|
||||
* to have a correct string
|
||||
*/
|
||||
if ('"' == tmp[0]) {
|
||||
/* if the first char is a quote, then so will the last one be */
|
||||
tmp[strlen(tmp)-1] = '\0';
|
||||
ptr = &tmp[1];
|
||||
} else {
|
||||
ptr = &tmp[0];
|
||||
}
|
||||
port_name = strdup(ptr);
|
||||
}
|
||||
|
||||
return port_name;
|
||||
}
|
||||
|
||||
|
@ -37,5 +37,6 @@ libmca_plm_la_SOURCES += \
|
||||
base/plm_base_jobid.c \
|
||||
base/plm_base_proxy.c \
|
||||
base/plm_base_orted_cmds.c \
|
||||
base/plm_base_rsh_support.c \
|
||||
base/plm_base_heartbeat.c
|
||||
endif
|
||||
|
@ -23,6 +23,7 @@
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/util/argv.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
@ -65,5 +66,13 @@ int orte_plm_base_close(void)
|
||||
OBJ_DESTRUCT(&orte_plm_globals.orted_cmd_lock);
|
||||
OBJ_DESTRUCT(&orte_plm_globals.orted_cmd_cond);
|
||||
|
||||
/* clearout the rsh support */
|
||||
if (NULL != orte_plm_globals.rsh_agent_argv) {
|
||||
opal_argv_free(orte_plm_globals.rsh_agent_argv);
|
||||
}
|
||||
if (NULL != orte_plm_globals.rsh_agent_path) {
|
||||
free(orte_plm_globals.rsh_agent_path);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -103,6 +103,11 @@ int orte_plm_base_open(void)
|
||||
/* init the next jobid */
|
||||
orte_plm_globals.next_jobid = 0;
|
||||
|
||||
/* init the rsh support */
|
||||
orte_plm_globals.rsh_agent_argv = NULL;
|
||||
orte_plm_globals.rsh_agent_path = NULL;
|
||||
orte_plm_globals.local_slaves = 0;
|
||||
|
||||
/* Open up all the components that we can find */
|
||||
|
||||
if (ORTE_SUCCESS !=
|
||||
|
@ -113,10 +113,14 @@ void orte_plm_base_receive_process_msg(int fd, short event, void *data)
|
||||
struct timeval beat;
|
||||
orte_app_context_t **apps, **child_apps;
|
||||
|
||||
/* setup a default response */
|
||||
OBJ_CONSTRUCT(&answer, opal_buffer_t);
|
||||
job = ORTE_JOBID_INVALID;
|
||||
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, &command, &count, ORTE_PLM_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return;
|
||||
goto ANSWER_LAUNCH;
|
||||
}
|
||||
|
||||
switch (command) {
|
||||
@ -124,9 +128,6 @@ void orte_plm_base_receive_process_msg(int fd, short event, void *data)
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:receive job launch command",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
/* setup a default response */
|
||||
OBJ_CONSTRUCT(&answer, opal_buffer_t);
|
||||
job = ORTE_JOBID_INVALID;
|
||||
|
||||
/* unpack the job object */
|
||||
count = 1;
|
||||
@ -138,8 +139,14 @@ void orte_plm_base_receive_process_msg(int fd, short event, void *data)
|
||||
/* if is a LOCAL slave cmd */
|
||||
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
|
||||
/* In this case, I cannot lookup job info. All I do is pass
|
||||
* this along to the local launcher
|
||||
* this along to the local launcher, IF it is available
|
||||
*/
|
||||
if (NULL == orte_plm.spawn) {
|
||||
/* can't do this operation */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED);
|
||||
rc = ORTE_ERR_NOT_SUPPORTED;
|
||||
goto ANSWER_LAUNCH;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto ANSWER_LAUNCH;
|
||||
@ -206,7 +213,6 @@ void orte_plm_base_receive_process_msg(int fd, short event, void *data)
|
||||
if (0 > (ret = orte_rml.send_buffer(&mev->sender, &answer, ORTE_RML_TAG_PLM_PROXY, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
OBJ_DESTRUCT(&answer);
|
||||
break;
|
||||
|
||||
case ORTE_PLM_UPDATE_PROC_STATE:
|
||||
@ -303,9 +309,10 @@ void orte_plm_base_receive_process_msg(int fd, short event, void *data)
|
||||
|
||||
/* release the message */
|
||||
OBJ_RELEASE(mev);
|
||||
|
||||
/* see if an error occurred - if so, wakeup so we can exit */
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
OBJ_DESTRUCT(&answer);
|
||||
|
||||
/* see if an error occurred - if so, wakeup the HNP so we can exit */
|
||||
if (orte_process_info.hnp && ORTE_SUCCESS != rc) {
|
||||
orte_trigger_event(&orte_exit);
|
||||
}
|
||||
}
|
||||
|
502
orte/mca/plm/base/plm_base_rsh_support.c
Обычный файл
502
orte/mca/plm/base/plm_base_rsh_support.c
Обычный файл
@ -0,0 +1,502 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include <signal.h>
|
||||
#include <ctype.h>
|
||||
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/util/os_path.h"
|
||||
#include "opal/util/path.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/basename.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/dash_host/dash_host.h"
|
||||
|
||||
#include "orte/mca/plm/base/plm_private.h"
|
||||
|
||||
static char **search(const char* agent_list);
|
||||
|
||||
int orte_plm_base_rsh_launch_agent_setup(void)
|
||||
{
|
||||
char *bname;
|
||||
int i;
|
||||
|
||||
/* if no agent was provided, then report not found */
|
||||
if (NULL == orte_rsh_agent) {
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
/* Take the orte_rsh_agent MCA param and search for the argv */
|
||||
orte_plm_globals.rsh_agent_argv = search(orte_rsh_agent);
|
||||
|
||||
if (0 == opal_argv_count(orte_plm_globals.rsh_agent_argv)) {
|
||||
/* nothing was found */
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
/* see if we can find the agent in the path */
|
||||
orte_plm_globals.rsh_agent_path =
|
||||
opal_path_findv(orte_plm_globals.rsh_agent_argv[0], X_OK,
|
||||
environ, NULL);
|
||||
|
||||
if (NULL == orte_plm_globals.rsh_agent_path) {
|
||||
/* not an error - just report not found */
|
||||
opal_argv_free(orte_plm_globals.rsh_agent_argv);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
bname = opal_basename(orte_plm_globals.rsh_agent_argv[0]);
|
||||
if (NULL != bname && 0 == strcmp(bname, "ssh")) {
|
||||
/* if xterm option was given, add '-X', ensuring we don't do it twice */
|
||||
if (NULL != orte_xterm) {
|
||||
opal_argv_append_unique_nosize(&orte_plm_globals.rsh_agent_argv, "-X");
|
||||
} else if (0 >= opal_output_get_verbosity(orte_plm_globals.output)) {
|
||||
/* if debug was not specified, and the user didn't explicitly
|
||||
* specify X11 forwarding/non-forwarding, add "-x" if it
|
||||
* isn't already there (check either case)
|
||||
*/
|
||||
for (i = 1; NULL != orte_plm_globals.rsh_agent_argv[i]; ++i) {
|
||||
if (0 == strcasecmp("-x",
|
||||
orte_plm_globals.rsh_agent_argv[i])) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (NULL == orte_plm_globals.rsh_agent_argv[i]) {
|
||||
opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-x");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static bool ack_recvd;
|
||||
|
||||
static void release_ack(int fd, short event, void *data)
|
||||
{
|
||||
orte_message_event_t *mev = (orte_message_event_t*)data;
|
||||
ack_recvd = true;
|
||||
OBJ_RELEASE(mev);
|
||||
}
|
||||
|
||||
static void recv_ack(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
||||
void* cbdata)
|
||||
{
|
||||
/* don't process this right away - we need to get out of the recv before
|
||||
* we process the message as it may ask us to do something that involves
|
||||
* more messaging! Instead, setup an event so that the message gets processed
|
||||
* as soon as we leave the recv.
|
||||
*
|
||||
* The macro makes a copy of the buffer, which we release above - the incoming
|
||||
* buffer, however, is NOT released here, although its payload IS transferred
|
||||
* to the message buffer for later processing
|
||||
*/
|
||||
ORTE_MESSAGE_EVENT(sender, buffer, tag, release_ack);
|
||||
}
|
||||
|
||||
static void set_handler_default(int sig)
|
||||
{
|
||||
struct sigaction act;
|
||||
|
||||
act.sa_handler = SIG_DFL;
|
||||
act.sa_flags = 0;
|
||||
sigemptyset(&act.sa_mask);
|
||||
|
||||
sigaction(sig, &act, (struct sigaction *)0);
|
||||
}
|
||||
|
||||
int orte_plm_base_local_slave_launch(orte_job_t *jdata)
|
||||
{
|
||||
char **argv;
|
||||
opal_list_t hosts;
|
||||
orte_node_t *node;
|
||||
char *nodename, *bootproxy, *cmd, *scp=NULL;
|
||||
char *exefile=NULL, *basename, *param, *path=NULL, *bppath=NULL;
|
||||
char *exec_path=NULL;
|
||||
char *tmp;
|
||||
bool flag;
|
||||
orte_app_context_t **apps, *app;
|
||||
int i;
|
||||
int rc;
|
||||
pid_t pid;
|
||||
long fd, fdmax = sysconf(_SC_OPEN_MAX);
|
||||
sigset_t sigs;
|
||||
bool local_op = false;
|
||||
|
||||
/* increment the local slave jobid */
|
||||
orte_plm_globals.local_slaves++;
|
||||
|
||||
/* point to the apps array */
|
||||
apps = (orte_app_context_t**)jdata->apps->addr;
|
||||
app = apps[0];
|
||||
|
||||
/* identify the target host - can only be one! */
|
||||
OBJ_CONSTRUCT(&hosts, opal_list_t);
|
||||
if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&hosts, &flag, app->dash_host))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&hosts);
|
||||
return rc;
|
||||
}
|
||||
if (1 < opal_list_get_size(&hosts)) {
|
||||
opal_output(0, "too many hosts: %d", (int)opal_list_get_size(&hosts));
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
node = (orte_node_t*)opal_list_remove_first(&hosts);
|
||||
nodename = strdup(node->name);
|
||||
OBJ_RELEASE(node);
|
||||
OBJ_DESTRUCT(&hosts);
|
||||
|
||||
/* is this a local operation? */
|
||||
opal_output(0, "local: %s node: %s", orte_process_info.nodename, nodename);
|
||||
if (0 == strcmp(orte_process_info.nodename, nodename)) {
|
||||
local_op = true;
|
||||
}
|
||||
|
||||
/* find the bootproxy */
|
||||
bootproxy = opal_find_absolute_path("orte-bootproxy.sh");
|
||||
|
||||
/* do we need to preload the binary? */
|
||||
if(app->preload_binary) {
|
||||
/* the target location -must- be an absolute path */
|
||||
if (NULL == app->preload_files_dest_dir ||
|
||||
!opal_path_is_absolute(app->preload_files_dest_dir)) {
|
||||
opal_output(0, "target location must be given and an absolute path: %s",
|
||||
(NULL == app->preload_files_dest_dir) ? "NULL" : app->preload_files_dest_dir);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
/* if the binary is not given in absolute path form,
|
||||
* then convert it to one
|
||||
*/
|
||||
if (!opal_path_is_absolute(app->app)) {
|
||||
exefile = opal_find_absolute_path(app->app);
|
||||
if (NULL == exefile) {
|
||||
opal_output(0, "could not find executable %s", app->app);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
} else {
|
||||
exefile = strdup(app->app);
|
||||
}
|
||||
/* construct the target path */
|
||||
basename = opal_basename(exefile);
|
||||
path = opal_os_path(false, app->preload_files_dest_dir, basename, NULL);
|
||||
free(basename);
|
||||
/* we are going to use the "bootproxy" script to launch
|
||||
* this job - so move it over to the target host as well
|
||||
*/
|
||||
bppath = opal_os_path(false, app->preload_files_dest_dir, "orte-bootproxy.sh", NULL);
|
||||
/* if this is a local node, then we just use the cp command */
|
||||
if (local_op) {
|
||||
scp = opal_find_absolute_path("cp");
|
||||
if (NULL == scp) {
|
||||
opal_output(0, "could not find cp");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
/* form and execute the cp commands */
|
||||
asprintf(&cmd, "%s %s %s", scp, exefile, path);
|
||||
system(cmd);
|
||||
free(cmd);
|
||||
asprintf(&cmd, "%s %s %s", scp, bootproxy, bppath);
|
||||
system(cmd);
|
||||
free(cmd);
|
||||
/* start the argv with the bootproxy cmd */
|
||||
argv = NULL;
|
||||
opal_argv_append_nosize(&argv, "orte-bootproxy.sh");
|
||||
/* set the exec path to bppath */
|
||||
exec_path = strdup(bppath);
|
||||
} else {
|
||||
/* find the scp command */
|
||||
scp = opal_find_absolute_path("scp");
|
||||
if (NULL == scp) {
|
||||
opal_output(0, "could not find scp");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
/* form and execute the scp commands */
|
||||
asprintf(&cmd, "%s %s %s:%s", scp, exefile, nodename, path);
|
||||
system(cmd);
|
||||
free(cmd);
|
||||
asprintf(&cmd, "%s %s %s:%s", scp, bootproxy, nodename, bppath);
|
||||
system(cmd);
|
||||
free(cmd);
|
||||
/* set the exec path to the agent path */
|
||||
exec_path = strdup(orte_plm_globals.rsh_agent_path);
|
||||
/* Start the argv with the rsh/ssh command */
|
||||
argv = opal_argv_copy(orte_plm_globals.rsh_agent_argv);
|
||||
/* add the hostname */
|
||||
opal_argv_append_nosize(&argv, nodename);
|
||||
/* add the bootproxy cmd */
|
||||
opal_argv_append_nosize(&argv, bootproxy);
|
||||
}
|
||||
}
|
||||
if (NULL != exefile) {
|
||||
free(exefile);
|
||||
}
|
||||
if (NULL != path) {
|
||||
free(path);
|
||||
}
|
||||
if (NULL != bppath) {
|
||||
free(bppath);
|
||||
}
|
||||
/* release the scp command */
|
||||
if (NULL != scp) {
|
||||
free(scp);
|
||||
}
|
||||
|
||||
/* done with bootproxy */
|
||||
free(bootproxy);
|
||||
|
||||
/* if there is a prefix, add it in a special way so the bootproxy
|
||||
* can deal with it
|
||||
*/
|
||||
if (NULL != app->prefix_dir) {
|
||||
asprintf(¶m, "OMPI_PREFIX=%s", app->prefix_dir);
|
||||
opal_argv_append_nosize(&argv, param);
|
||||
free(param);
|
||||
}
|
||||
|
||||
/* add all OMPI params from the app */
|
||||
if (NULL != app->env) {
|
||||
for (i=0; NULL != app->env[i]; i++) {
|
||||
if (0 == strncmp(app->env[i], "OMPI_", 5)) {
|
||||
if (NULL == strchr(app->env[i], ';') &&
|
||||
NULL == strchr(app->env[i], ':')) {
|
||||
opal_argv_append_nosize(&argv, app->env[i]);
|
||||
} else {
|
||||
tmp = strchr(app->env[i], '=');
|
||||
*tmp = '\0';
|
||||
tmp++;
|
||||
asprintf(¶m, "%s=\"%s\"", app->env[i], tmp);
|
||||
opal_argv_append_nosize(&argv, param);
|
||||
free(param);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* add MCA params required for launch */
|
||||
|
||||
/* tell ESS to select the "slave" component */
|
||||
param = mca_base_param_environ_variable("ess",NULL,NULL);
|
||||
opal_setenv(param, "slave", true, &argv);
|
||||
free(param);
|
||||
|
||||
/* tell ROUTED to select the "slave" component */
|
||||
param = mca_base_param_environ_variable("routed",NULL,NULL);
|
||||
opal_setenv(param, "slave", true, &argv);
|
||||
free(param);
|
||||
|
||||
/* tell GRPCOMM to select the "hier" component */
|
||||
param = mca_base_param_environ_variable("grpcomm",NULL,NULL);
|
||||
opal_setenv(param, "hier", true, &argv);
|
||||
free(param);
|
||||
|
||||
/* must tell "hier" two pieces of info */
|
||||
param = mca_base_param_environ_variable("grpcomm","hier","num_nodes");
|
||||
opal_setenv(param, "1", true, &argv);
|
||||
free(param);
|
||||
param = mca_base_param_environ_variable("grpcomm","hier","step");
|
||||
opal_setenv(param, "1", true, &argv);
|
||||
free(param);
|
||||
|
||||
/* set the daemon uri to point to me */
|
||||
param = mca_base_param_environ_variable("orte","local_daemon","uri");
|
||||
asprintf(&path, "\"%s\"", orte_rml.get_contact_info());
|
||||
opal_setenv(param, path, true, &argv);
|
||||
free(param);
|
||||
free(path);
|
||||
|
||||
/* set a value for the HNP uri - it won't be needed, but is
|
||||
* required to pass existence tests
|
||||
*/
|
||||
param = mca_base_param_environ_variable("orte","hnp","uri");
|
||||
asprintf(&path, "\"%s\"", orte_process_info.my_hnp_uri);
|
||||
opal_setenv(param, path, true, &argv);
|
||||
free(param);
|
||||
free(path);
|
||||
|
||||
/* setup yield schedule to be aggressive */
|
||||
param = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
|
||||
opal_setenv(param, "0", true, &argv);
|
||||
free(param);
|
||||
|
||||
/* set the app_context number */
|
||||
param = mca_base_param_environ_variable("orte","app","num");
|
||||
opal_setenv(param, "1", true, &argv);
|
||||
free(param);
|
||||
|
||||
/* ensure that any "name" envar is cleared */
|
||||
param = mca_base_param_environ_variable("orte","ess","name");
|
||||
opal_unsetenv(param, &argv);
|
||||
free(param);
|
||||
|
||||
/* set the jobid */
|
||||
orte_util_convert_jobid_to_string(&cmd, orte_plm_globals.local_slaves);
|
||||
param = mca_base_param_environ_variable("orte","ess","jobid");
|
||||
opal_setenv(param, cmd, true, &argv);
|
||||
free(param);
|
||||
free(cmd);
|
||||
/* set the jobid in jdata so the caller knows what it is */
|
||||
jdata->jobid = orte_plm_globals.local_slaves;
|
||||
|
||||
/* set the vpid to 0 */
|
||||
param = mca_base_param_environ_variable("orte","ess","vpid");
|
||||
opal_setenv(param, "0", true, &argv);
|
||||
free(param);
|
||||
|
||||
/* set the number of procs */
|
||||
param = mca_base_param_environ_variable("orte","ess","num_procs");
|
||||
opal_setenv(param, "1", true, &argv);
|
||||
free(param);
|
||||
|
||||
/* some user-requested public environmental variables */
|
||||
opal_setenv("OMPI_COMM_WORLD_RANK", "0", true, &argv);
|
||||
opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", "0", true, &argv);
|
||||
opal_setenv("OMPI_UNIVERSE_SIZE", "1", true, &argv);
|
||||
opal_setenv("OMPI_COMM_WORLD_SIZE", "1", true, &argv);
|
||||
opal_setenv("OMPI_COMM_WORLD_LOCAL_SIZE", "1", true, &argv);
|
||||
|
||||
/* add the provided argv*/
|
||||
for (i=0; NULL != app->argv[i]; i++) {
|
||||
opal_argv_append_nosize(&argv, app->argv[i]);
|
||||
}
|
||||
|
||||
param = opal_argv_join(argv, ' ');
|
||||
opal_output(0, "%s plm:rsh: final bootproxy cmd:\n\t%s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == param) ? "NULL" : param);
|
||||
if (NULL != param) free(param);
|
||||
|
||||
/* fork a child to exec the rsh/ssh session */
|
||||
pid = fork();
|
||||
if (pid < 0) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
|
||||
return ORTE_ERR_SYS_LIMITS_CHILDREN;
|
||||
}
|
||||
|
||||
/* child */
|
||||
if (pid == 0) {
|
||||
/* close all file descriptors w/ exception of stdin/stdout/stderr */
|
||||
for(fd=3; fd<fdmax; fd++)
|
||||
close(fd);
|
||||
|
||||
/* Set signal handlers back to the default. Do this close
|
||||
to the execve() because the event library may (and likely
|
||||
will) reset them. If we don't do this, the event
|
||||
library may have left some set that, at least on some
|
||||
OS's, don't get reset via fork() or exec(). Hence, the
|
||||
orted could be unkillable (for example). */
|
||||
|
||||
set_handler_default(SIGTERM);
|
||||
set_handler_default(SIGINT);
|
||||
set_handler_default(SIGHUP);
|
||||
set_handler_default(SIGPIPE);
|
||||
set_handler_default(SIGCHLD);
|
||||
|
||||
/* Unblock all signals, for many of the same reasons that
|
||||
we set the default handlers, above. This is noticable
|
||||
on Linux where the event library blocks SIGTERM, but we
|
||||
don't want that blocked by the orted (or, more
|
||||
specifically, we don't want it to be blocked by the
|
||||
orted and then inherited by the ORTE processes that it
|
||||
forks, making them unkillable by SIGTERM). */
|
||||
sigprocmask(0, 0, &sigs);
|
||||
sigprocmask(SIG_UNBLOCK, &sigs, 0);
|
||||
|
||||
/* exec the slave */
|
||||
execv(exec_path, argv);
|
||||
opal_output(0, "plm:rsh: execv of %s failed with errno=%s(%d)\n",
|
||||
exec_path, strerror(errno), errno);
|
||||
exit(-1);
|
||||
} else {
|
||||
/* parent waits to hear that slave is running */
|
||||
ack_recvd = false;
|
||||
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_REPORT_REMOTE_LAUNCH,
|
||||
ORTE_RML_NON_PERSISTENT, recv_ack, NULL);
|
||||
|
||||
ORTE_PROGRESSED_WAIT(ack_recvd, 0, 1);
|
||||
/* cleanup */
|
||||
free(exec_path);
|
||||
opal_argv_free(argv);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Take a colon-delimited list of agents and locate the first one that
|
||||
* we are able to find in the PATH. Split that one into argv and
|
||||
* return it. If nothing found, then return NULL.
|
||||
*/
|
||||
static char **search(const char* agent_list)
|
||||
{
|
||||
int i, j;
|
||||
char *line, **lines = opal_argv_split(agent_list, ':');
|
||||
char **tokens, *tmp;
|
||||
char cwd[OMPI_PATH_MAX];
|
||||
|
||||
getcwd(cwd, OMPI_PATH_MAX);
|
||||
for (i = 0; NULL != lines[i]; ++i) {
|
||||
line = lines[i];
|
||||
|
||||
/* Trim whitespace at the beginning and end of the line */
|
||||
for (j = 0; '\0' != line[j] && isspace(line[j]); ++line) {
|
||||
continue;
|
||||
}
|
||||
for (j = strlen(line) - 2; j > 0 && isspace(line[j]); ++j) {
|
||||
line[j] = '\0';
|
||||
}
|
||||
if (strlen(line) <= 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Split it */
|
||||
tokens = opal_argv_split(line, ' ');
|
||||
|
||||
/* Look for the first token in the PATH */
|
||||
tmp = opal_path_findv(tokens[0], X_OK, environ, cwd);
|
||||
if (NULL != tmp) {
|
||||
free(tokens[0]);
|
||||
tokens[0] = tmp;
|
||||
opal_argv_free(lines);
|
||||
return tokens;
|
||||
}
|
||||
|
||||
/* Didn't find it */
|
||||
opal_argv_free(tokens);
|
||||
}
|
||||
|
||||
/* Doh -- didn't find anything */
|
||||
opal_argv_free(lines);
|
||||
return NULL;
|
||||
}
|
@ -55,6 +55,12 @@ typedef struct {
|
||||
orte_jobid_t next_jobid;
|
||||
/* time when daemons started launch */
|
||||
struct timeval daemonlaunchstart;
|
||||
/* rsh launch agent path */
|
||||
char *rsh_agent_path;
|
||||
/* rsh launch agent argv */
|
||||
char **rsh_agent_argv;
|
||||
/* jobid for local slaves */
|
||||
orte_jobid_t local_slaves;
|
||||
} orte_plm_globals_t;
|
||||
/**
|
||||
* Global instance of PLM framework data
|
||||
@ -84,6 +90,12 @@ ORTE_DECLSPEC int orte_plm_base_create_jobid(orte_jobid_t *jobid);
|
||||
|
||||
ORTE_DECLSPEC int orte_plm_base_setup_orted_cmd(int *argc, char ***argv);
|
||||
|
||||
/**
|
||||
* Local slave launch
|
||||
*/
|
||||
ORTE_DECLSPEC int orte_plm_base_local_slave_launch(orte_job_t *jdata);
|
||||
ORTE_DECLSPEC int orte_plm_base_rsh_launch_agent_setup(void);
|
||||
|
||||
/**
|
||||
* Heartbeat support
|
||||
*/
|
||||
|
@ -69,10 +69,6 @@ struct orte_plm_rsh_component_t {
|
||||
bool disable_qrsh;
|
||||
int delay;
|
||||
int priority;
|
||||
char *agent_param;
|
||||
char** agent_argv;
|
||||
int agent_argc;
|
||||
char* agent_path;
|
||||
bool tree_spawn;
|
||||
opal_list_t children;
|
||||
orte_std_cntr_t num_children;
|
||||
|
@ -38,7 +38,6 @@
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/path.h"
|
||||
#include "opal/util/basename.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
@ -50,11 +49,6 @@
|
||||
#include "orte/mca/plm/base/plm_private.h"
|
||||
#include "orte/mca/plm/rsh/plm_rsh.h"
|
||||
|
||||
/*
|
||||
* Local function
|
||||
*/
|
||||
static char **search(const char* agent_list);
|
||||
|
||||
|
||||
/*
|
||||
* Public string showing the plm ompi_rsh component version number
|
||||
@ -105,9 +99,6 @@ int orte_plm_rsh_component_open(void)
|
||||
OBJ_CONSTRUCT(&mca_plm_rsh_component.lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&mca_plm_rsh_component.cond, opal_condition_t);
|
||||
mca_plm_rsh_component.num_children = 0;
|
||||
mca_plm_rsh_component.agent_argv = NULL;
|
||||
mca_plm_rsh_component.agent_argc = 0;
|
||||
mca_plm_rsh_component.agent_path = NULL;
|
||||
OBJ_CONSTRUCT(&mca_plm_rsh_component.children, opal_list_t);
|
||||
|
||||
/* lookup parameters */
|
||||
@ -142,12 +133,6 @@ int orte_plm_rsh_component_open(void)
|
||||
"If set to 1, assume that the shell on the remote node is the same as the shell on the local node. Otherwise, probe for what the remote shell.",
|
||||
false, false, 1, &tmp);
|
||||
mca_plm_rsh_component.assume_same_shell = OPAL_INT_TO_BOOL(tmp);
|
||||
|
||||
tmp = mca_base_param_reg_string(c, "agent",
|
||||
"The command used to launch executables on remote nodes (typically either \"ssh\" or \"rsh\")",
|
||||
false, false, "ssh : rsh", NULL);
|
||||
mca_base_param_reg_syn_name(tmp, "pls", "rsh_agent", true);
|
||||
mca_base_param_lookup_string(tmp, &mca_plm_rsh_component.agent_param);
|
||||
|
||||
mca_base_param_reg_int(c, "tree_spawn",
|
||||
"If set to 1, launch via a tree-based topology",
|
||||
@ -160,106 +145,56 @@ int orte_plm_rsh_component_open(void)
|
||||
|
||||
int orte_plm_rsh_component_query(mca_base_module_t **module, int *priority)
|
||||
{
|
||||
char *bname;
|
||||
size_t i;
|
||||
|
||||
/* Take the string that was given to us by the plm_rsh_agent MCA
|
||||
param and search for it */
|
||||
mca_plm_rsh_component.agent_argv =
|
||||
search(mca_plm_rsh_component.agent_param);
|
||||
mca_plm_rsh_component.agent_argc =
|
||||
opal_argv_count(mca_plm_rsh_component.agent_argv);
|
||||
mca_plm_rsh_component.agent_path = NULL;
|
||||
|
||||
|
||||
/* To be absolutely sure that we are under an SGE parallel env */
|
||||
if (!mca_plm_rsh_component.disable_qrsh &&
|
||||
NULL != getenv("SGE_ROOT") && NULL != getenv("ARC") &&
|
||||
NULL != getenv("PE_HOSTFILE") && NULL != getenv("JOB_ID")) {
|
||||
/* setting exec_argv and exec_path for qrsh */
|
||||
asprintf(&mca_plm_rsh_component.agent_param, "qrsh");
|
||||
asprintf(&mca_plm_rsh_component.agent_path, "%s/bin/%s", getenv("SGE_ROOT"), getenv("ARC"));
|
||||
asprintf(&mca_plm_rsh_component.agent_argv[0], "%s/bin/%s/qrsh", getenv("SGE_ROOT"), getenv("ARC"));
|
||||
asprintf(&orte_plm_globals.rsh_agent_path, "%s/bin/%s", getenv("SGE_ROOT"), getenv("ARC"));
|
||||
asprintf(&orte_plm_globals.rsh_agent_argv[0], "%s/bin/%s/qrsh", getenv("SGE_ROOT"), getenv("ARC"));
|
||||
/* double check that we have access and permissions for the qrsh agent */
|
||||
if (NULL == opal_path_findv(orte_plm_globals.rsh_agent_argv[0], X_OK,
|
||||
environ, NULL)) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:rsh: unable to be used: cannot find path "
|
||||
"or execution permissions not set for launching agent \"%s\"\n",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
orte_plm_globals.rsh_agent_argv[0]));
|
||||
*module = NULL;
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
/* automatically add -inherit and grid engine PE related flags */
|
||||
opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-inherit");
|
||||
/* Don't use the "-noshell" flag as qrsh would have a problem
|
||||
* swallowing a long command */
|
||||
opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-nostdin");
|
||||
opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-V");
|
||||
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
|
||||
opal_argv_append_nosize(&orte_plm_globals.rsh_agent_argv, "-verbose");
|
||||
opal_output_verbose(1, orte_plm_globals.output,
|
||||
"%s plm:rsh: using %s for launching\n",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
mca_plm_rsh_component.agent_argv[0]);
|
||||
"%s plm:rsh: using %s for launching\n",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
orte_plm_globals.rsh_agent_argv[0]);
|
||||
}
|
||||
*priority = mca_plm_rsh_component.priority;
|
||||
*module = (mca_base_module_t *) &orte_plm_rsh_module;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
if (mca_plm_rsh_component.agent_argc > 0) {
|
||||
/* If the agent is ssh, and debug was not selected, then
|
||||
automatically add "-x" */
|
||||
|
||||
bname = opal_basename(mca_plm_rsh_component.agent_argv[0]);
|
||||
if (NULL != bname && 0 == strcmp(bname, "ssh")) {
|
||||
/* if xterm option was given, add '-X' */
|
||||
if (NULL != orte_xterm) {
|
||||
opal_argv_append(&mca_plm_rsh_component.agent_argc,
|
||||
&mca_plm_rsh_component.agent_argv, "-X");
|
||||
} else if (0 >= opal_output_get_verbosity(orte_plm_globals.output)) {
|
||||
/* if debug was not specified, and the user didn't explicitly
|
||||
* specify X11 forwarding/non-forwarding, add "-x"
|
||||
*/
|
||||
for (i = 1; NULL != mca_plm_rsh_component.agent_argv[i]; ++i) {
|
||||
if (0 == strcasecmp("-x",
|
||||
mca_plm_rsh_component.agent_argv[i])) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (NULL == mca_plm_rsh_component.agent_argv[i]) {
|
||||
opal_argv_append(&mca_plm_rsh_component.agent_argc,
|
||||
&mca_plm_rsh_component.agent_argv, "-x");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* If the agent is qrsh, then automatically add -inherit
|
||||
* and grid engine PE related flags */
|
||||
if (NULL != bname && 0 == strcmp(bname, "qrsh")) {
|
||||
opal_argv_append(&mca_plm_rsh_component.agent_argc,
|
||||
&mca_plm_rsh_component.agent_argv, "-inherit");
|
||||
/* Don't use the "-noshell" flag as qrsh would have a problem
|
||||
* swallowing a long command */
|
||||
opal_argv_append(&mca_plm_rsh_component.agent_argc,
|
||||
&mca_plm_rsh_component.agent_argv, "-nostdin");
|
||||
opal_argv_append(&mca_plm_rsh_component.agent_argc,
|
||||
&mca_plm_rsh_component.agent_argv, "-V");
|
||||
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
|
||||
opal_argv_append(&mca_plm_rsh_component.agent_argc,
|
||||
&mca_plm_rsh_component.agent_argv, "-verbose");
|
||||
}
|
||||
}
|
||||
if (NULL != bname) {
|
||||
free(bname);
|
||||
}
|
||||
}
|
||||
|
||||
/* If we didn't find the agent in the path, then don't use this
|
||||
component */
|
||||
if (NULL == mca_plm_rsh_component.agent_argv ||
|
||||
NULL == mca_plm_rsh_component.agent_argv[0]) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:rsh: unable to be used: cannot find the "
|
||||
"launching agent. Looked for: %s\n",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
mca_plm_rsh_component.agent_param));
|
||||
*module = NULL;
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
mca_plm_rsh_component.agent_path =
|
||||
opal_path_findv(mca_plm_rsh_component.agent_argv[0], X_OK,
|
||||
environ, NULL);
|
||||
if (NULL == mca_plm_rsh_component.agent_path) {
|
||||
|
||||
/* if this isn't an SGE environment, see if rsh/ssh is available */
|
||||
|
||||
if (ORTE_SUCCESS != orte_plm_base_rsh_launch_agent_setup()) {
|
||||
/* this isn't an error - we just cannot be selected */
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:rsh: unable to be used: cannot find path "
|
||||
"for launching agent \"%s\"\n",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
mca_plm_rsh_component.agent_argv[0]));
|
||||
orte_plm_globals.rsh_agent_argv[0]));
|
||||
*module = NULL;
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* we are good - make ourselves available */
|
||||
*priority = mca_plm_rsh_component.priority;
|
||||
*module = (mca_base_module_t *) &orte_plm_rsh_module;
|
||||
return ORTE_SUCCESS;
|
||||
@ -272,63 +207,6 @@ int orte_plm_rsh_component_close(void)
|
||||
OBJ_DESTRUCT(&mca_plm_rsh_component.lock);
|
||||
OBJ_DESTRUCT(&mca_plm_rsh_component.cond);
|
||||
OBJ_DESTRUCT(&mca_plm_rsh_component.children);
|
||||
if (NULL != mca_plm_rsh_component.agent_param) {
|
||||
free(mca_plm_rsh_component.agent_param);
|
||||
}
|
||||
if (NULL != mca_plm_rsh_component.agent_argv) {
|
||||
opal_argv_free(mca_plm_rsh_component.agent_argv);
|
||||
}
|
||||
if (NULL != mca_plm_rsh_component.agent_path) {
|
||||
free(mca_plm_rsh_component.agent_path);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Take a colon-delimited list of agents and locate the first one that
|
||||
* we are able to find in the PATH. Split that one into argv and
|
||||
* return it. If nothing found, then return NULL.
|
||||
*/
|
||||
static char **search(const char* agent_list)
|
||||
{
|
||||
int i, j;
|
||||
char *line, **lines = opal_argv_split(agent_list, ':');
|
||||
char **tokens, *tmp;
|
||||
char cwd[OMPI_PATH_MAX];
|
||||
|
||||
getcwd(cwd, OMPI_PATH_MAX);
|
||||
for (i = 0; NULL != lines[i]; ++i) {
|
||||
line = lines[i];
|
||||
|
||||
/* Trim whitespace at the beginning and end of the line */
|
||||
for (j = 0; '\0' != line[j] && isspace(line[j]); ++line) {
|
||||
continue;
|
||||
}
|
||||
for (j = strlen(line) - 2; j > 0 && isspace(line[j]); ++j) {
|
||||
line[j] = '\0';
|
||||
}
|
||||
if (strlen(line) <= 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Split it */
|
||||
tokens = opal_argv_split(line, ' ');
|
||||
|
||||
/* Look for the first token in the PATH */
|
||||
tmp = opal_path_findv(tokens[0], X_OK, environ, cwd);
|
||||
if (NULL != tmp) {
|
||||
free(tokens[0]);
|
||||
tokens[0] = tmp;
|
||||
opal_argv_free(lines);
|
||||
return tokens;
|
||||
}
|
||||
|
||||
/* Didn't find it */
|
||||
opal_argv_free(tokens);
|
||||
}
|
||||
|
||||
/* Doh -- didn't find anything */
|
||||
opal_argv_free(lines);
|
||||
return NULL;
|
||||
}
|
||||
|
@ -80,6 +80,7 @@
|
||||
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/mca/ess/base/base.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rmaps/rmaps.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
@ -147,6 +148,7 @@ static struct timeval joblaunchstart, joblaunchstop;
|
||||
|
||||
/* local global storage */
|
||||
static orte_jobid_t active_job=ORTE_JOBID_INVALID;
|
||||
static orte_jobid_t local_slaves;
|
||||
|
||||
/**
|
||||
* Init the module
|
||||
@ -158,6 +160,13 @@ int orte_plm_rsh_init(void)
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
/* we set the local slaves up to have a job family of zero.
|
||||
* this provides a convenient way of checking whether or
|
||||
* not a process is a local slave
|
||||
*/
|
||||
local_slaves = 0;
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
@ -204,8 +213,8 @@ static int orte_plm_rsh_probe(char *nodename,
|
||||
exit(01);
|
||||
}
|
||||
/* Build argv array */
|
||||
argv = opal_argv_copy(mca_plm_rsh_component.agent_argv);
|
||||
argc = mca_plm_rsh_component.agent_argc;
|
||||
argv = opal_argv_copy(orte_plm_globals.rsh_agent_argv);
|
||||
argc = opal_argv_count(orte_plm_globals.rsh_agent_argv);
|
||||
opal_argv_append(&argc, &argv, nodename);
|
||||
opal_argv_append(&argc, &argv, "echo $SHELL");
|
||||
|
||||
@ -345,21 +354,13 @@ static void orte_plm_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
|
||||
|
||||
}
|
||||
|
||||
static int setup_launch(int *argcptr, char ***argvptr,
|
||||
char *nodename,
|
||||
int *node_name_index1,
|
||||
int *proc_vpid_index, char *prefix_dir)
|
||||
static int setup_shell(orte_plm_rsh_shell_t *rshell,
|
||||
orte_plm_rsh_shell_t *lshell,
|
||||
char *nodename, int *argc, char ***argv)
|
||||
{
|
||||
struct passwd *p;
|
||||
int argc;
|
||||
char **argv;
|
||||
char *param;
|
||||
orte_plm_rsh_shell_t remote_shell, local_shell;
|
||||
char *lib_base, *bin_base;
|
||||
int orted_argc;
|
||||
char **orted_argv;
|
||||
char *orted_cmd, *orted_prefix, *final_cmd;
|
||||
int orted_index;
|
||||
struct passwd *p;
|
||||
char *param;
|
||||
int rc;
|
||||
|
||||
/* What is our local shell? */
|
||||
@ -409,7 +410,7 @@ static int setup_launch(int *argcptr, char ***argvptr,
|
||||
}
|
||||
|
||||
if (ORTE_PLM_RSH_SHELL_UNKNOWN == remote_shell) {
|
||||
opal_output(0, "WARNING: rsh probe returned unhandled shell; assuming bash\n");
|
||||
opal_output(0, "WARNING: rsh probe returned unhandled shell; assuming bash\n");
|
||||
remote_shell = ORTE_PLM_RSH_SHELL_BASH;
|
||||
}
|
||||
}
|
||||
@ -419,6 +420,52 @@ static int setup_launch(int *argcptr, char ***argvptr,
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
remote_shell, orte_plm_rsh_shell_name[remote_shell]));
|
||||
|
||||
/* Do we need to source .profile on the remote side?
|
||||
- sh: yes (see bash(1))
|
||||
- ksh: yes (see ksh(1))
|
||||
- bash: no (see bash(1))
|
||||
- [t]csh: no (see csh(1) and tcsh(1))
|
||||
- zsh: no (see http://zsh.sourceforge.net/FAQ/zshfaq03.html#l19)
|
||||
*/
|
||||
|
||||
if (ORTE_PLM_RSH_SHELL_SH == remote_shell ||
|
||||
ORTE_PLM_RSH_SHELL_KSH == remote_shell) {
|
||||
int i;
|
||||
char **tmp;
|
||||
tmp = opal_argv_split("( test ! -r ./.profile || . ./.profile;", ' ');
|
||||
if (NULL == tmp) {
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
for (i = 0; NULL != tmp[i]; ++i) {
|
||||
opal_argv_append(argc, argv, tmp[i]);
|
||||
}
|
||||
opal_argv_free(tmp);
|
||||
}
|
||||
|
||||
/* pass results back */
|
||||
*rshell = remote_shell;
|
||||
*lshell = local_shell;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int setup_launch(int *argcptr, char ***argvptr,
|
||||
char *nodename,
|
||||
int *node_name_index1,
|
||||
int *proc_vpid_index, char *prefix_dir)
|
||||
{
|
||||
int argc;
|
||||
char **argv;
|
||||
char *param;
|
||||
orte_plm_rsh_shell_t remote_shell, local_shell;
|
||||
char *lib_base, *bin_base;
|
||||
int orted_argc;
|
||||
char **orted_argv;
|
||||
char *orted_cmd, *orted_prefix, *final_cmd;
|
||||
int orted_index;
|
||||
int rc;
|
||||
|
||||
|
||||
/* Figure out the basenames for the libdir and bindir. This
|
||||
requires some explanation:
|
||||
|
||||
@ -452,31 +499,16 @@ static int setup_launch(int *argcptr, char ***argvptr,
|
||||
/*
|
||||
* Build argv array
|
||||
*/
|
||||
argv = opal_argv_copy(mca_plm_rsh_component.agent_argv);
|
||||
argc = mca_plm_rsh_component.agent_argc;
|
||||
argv = opal_argv_copy(orte_plm_globals.rsh_agent_argv);
|
||||
argc = opal_argv_count(orte_plm_globals.rsh_agent_argv);
|
||||
*node_name_index1 = argc;
|
||||
opal_argv_append(&argc, &argv, "<template>");
|
||||
|
||||
/* Do we need to source .profile on the remote side?
|
||||
- sh: yes (see bash(1))
|
||||
- ksh: yes (see ksh(1))
|
||||
- bash: no (see bash(1))
|
||||
- [t]csh: no (see csh(1) and tcsh(1))
|
||||
- zsh: no (see http://zsh.sourceforge.net/FAQ/zshfaq03.html#l19)
|
||||
*/
|
||||
|
||||
if (ORTE_PLM_RSH_SHELL_SH == remote_shell ||
|
||||
ORTE_PLM_RSH_SHELL_KSH == remote_shell) {
|
||||
int i;
|
||||
char **tmp;
|
||||
tmp = opal_argv_split("( test ! -r ./.profile || . ./.profile;", ' ');
|
||||
if (NULL == tmp) {
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
for (i = 0; NULL != tmp[i]; ++i) {
|
||||
opal_argv_append(&argc, &argv, tmp[i]);
|
||||
}
|
||||
opal_argv_free(tmp);
|
||||
/* setup the correct shell info */
|
||||
if (ORTE_SUCCESS != (rc = setup_shell(&remote_shell, &local_shell,
|
||||
nodename, &argc, &argv))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* now get the orted cmd - as specified by user - into our tmp array.
|
||||
@ -706,7 +738,7 @@ static void ssh_child(int argc, char **argv,
|
||||
* about remote launches here
|
||||
*/
|
||||
exec_argv = argv;
|
||||
exec_path = strdup(mca_plm_rsh_component.agent_path);
|
||||
exec_path = strdup(orte_plm_globals.rsh_agent_path);
|
||||
|
||||
/* pass the vpid */
|
||||
rc = orte_util_convert_vpid_to_string(&var, vpid);
|
||||
@ -942,6 +974,17 @@ int orte_plm_rsh_launch(orte_job_t *jdata)
|
||||
orte_jobid_t failed_job;
|
||||
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
|
||||
|
||||
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
|
||||
/* if this is a request to launch a local slave,
|
||||
* then we will not be launching an orted - we will
|
||||
* directly ssh the slave process itself. No mapping
|
||||
* is performed to support this - the caller must
|
||||
* provide all the info required to launch the job,
|
||||
* including the target hosts
|
||||
*/
|
||||
return orte_plm_base_local_slave_launch(jdata);
|
||||
}
|
||||
|
||||
/* default to declaring the daemon launch as having failed */
|
||||
failed_job = ORTE_PROC_MY_NAME->jobid;
|
||||
|
||||
@ -1453,3 +1496,4 @@ static orte_plm_rsh_shell_t find_shell(char *shell)
|
||||
/* We didn't find it */
|
||||
return ORTE_PLM_RSH_SHELL_UNKNOWN;
|
||||
}
|
||||
|
||||
|
@ -39,3 +39,13 @@ the map for this application. This can be caused by a lack of
|
||||
an allocation, or by an error in the Open MPI code. Please check
|
||||
to ensure you have a SLURM allocation. If you do, then please pass
|
||||
the error to the Open MPI user's mailing list for assistance.
|
||||
#
|
||||
[no-local-slave-support]
|
||||
A call was made to launch a local slave process, but no support
|
||||
is available for doing so. Launching a local slave requires support
|
||||
for either rsh or ssh on the backend nodes where MPI processes
|
||||
are running.
|
||||
|
||||
Please consult with your system administrator about obtaining
|
||||
such support.
|
||||
|
||||
|
@ -104,7 +104,7 @@ orte_plm_base_module_1_0_0_t orte_plm_slurm_module = {
|
||||
static pid_t srun_pid = 0;
|
||||
static orte_jobid_t active_job = ORTE_JOBID_INVALID;
|
||||
static bool failed_launch;
|
||||
|
||||
static bool local_launch_available = false;
|
||||
|
||||
/**
|
||||
* Init the module
|
||||
@ -116,6 +116,11 @@ static int plm_slurm_init(void)
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS == orte_plm_base_rsh_launch_agent_setup()) {
|
||||
local_launch_available = true;
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
@ -148,6 +153,22 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
||||
int proc_vpid_index;
|
||||
orte_jobid_t failed_job;
|
||||
|
||||
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
|
||||
/* if this is a request to launch a local slave,
|
||||
* then we will not be launching an orted - we will
|
||||
* directly ssh the slave process itself. No mapping
|
||||
* is performed to support this - the caller must
|
||||
* provide all the info required to launch the job,
|
||||
* including the target hosts
|
||||
*/
|
||||
if (!local_launch_available) {
|
||||
/* if we can't support this, then abort */
|
||||
orte_show_help("help-plm-slurm.txt", "no-local-slave-support", true);
|
||||
return ORTE_ERR_FAILED_TO_START;
|
||||
}
|
||||
return orte_plm_base_local_slave_launch(jdata);
|
||||
}
|
||||
|
||||
/* if we are timing, record the start time */
|
||||
if (orte_timing) {
|
||||
gettimeofday(&orte_plm_globals.daemonlaunchstart, NULL);
|
||||
@ -328,7 +349,8 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
||||
}
|
||||
|
||||
/* If not yet set, copy it; iff set, then it's the
|
||||
same anyway */
|
||||
* same anyway
|
||||
*/
|
||||
if (NULL == cur_prefix) {
|
||||
cur_prefix = strdup(app_prefix_dir);
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
@ -348,6 +370,11 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
||||
free(nodelist_flat);
|
||||
free(var);
|
||||
|
||||
/* enable local launch by the orteds */
|
||||
var = mca_base_param_environ_variable("plm", NULL, NULL);
|
||||
opal_setenv(var, "rsh", true, &env);
|
||||
free(var);
|
||||
|
||||
/* exec the daemon(s) */
|
||||
if (ORTE_SUCCESS != (rc = plm_slurm_start_proc(argc, argv, env, cur_prefix))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
|
@ -50,3 +50,13 @@ If you do not understand this error message, please try the following:
|
||||
2. Use the --prefix option to indicate where we can
|
||||
find that executable
|
||||
3. Talk to your local system administrator
|
||||
#
|
||||
[no-local-slave-support]
|
||||
A call was made to launch a local slave process, but no support
|
||||
is available for doing so. Launching a local slave requires support
|
||||
for either rsh or ssh on the backend nodes where MPI processes
|
||||
are running.
|
||||
|
||||
Please consult with your system administrator about obtaining
|
||||
such support.
|
||||
|
||||
|
@ -92,6 +92,7 @@ static void failed_start(int fd, short event, void *arg);
|
||||
* Local "global" variables
|
||||
*/
|
||||
static opal_event_t *ev=NULL;
|
||||
static bool local_launch_available = false;
|
||||
|
||||
/*
|
||||
* Global variable
|
||||
@ -117,6 +118,11 @@ static int plm_tm_init(void)
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS == orte_plm_base_rsh_launch_agent_setup()) {
|
||||
local_launch_available = true;
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
@ -148,6 +154,22 @@ static int plm_tm_launch_job(orte_job_t *jdata)
|
||||
mode_t current_umask;
|
||||
orte_jobid_t failed_job;
|
||||
|
||||
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
|
||||
/* if this is a request to launch a local slave,
|
||||
* then we will not be launching an orted - we will
|
||||
* directly ssh the slave process itself. No mapping
|
||||
* is performed to support this - the caller must
|
||||
* provide all the info required to launch the job,
|
||||
* including the target hosts
|
||||
*/
|
||||
if (!local_launch_available) {
|
||||
/* if we can't support this, then abort */
|
||||
orte_show_help("help-plm-tm.txt", "no-local-slave-support", true);
|
||||
return ORTE_ERR_FAILED_TO_START;
|
||||
}
|
||||
return orte_plm_base_local_slave_launch(jdata);
|
||||
}
|
||||
|
||||
/* if we are timing, record the start time */
|
||||
if (orte_timing) {
|
||||
gettimeofday(&orte_plm_globals.daemonlaunchstart, NULL);
|
||||
@ -233,6 +255,11 @@ static int plm_tm_launch_job(orte_job_t *jdata)
|
||||
/* setup environment */
|
||||
env = opal_argv_copy(orte_launch_environ);
|
||||
|
||||
/* enable local launch by the orteds */
|
||||
var = mca_base_param_environ_variable("plm", NULL, NULL);
|
||||
opal_setenv(var, "rsh", true, &env);
|
||||
free(var);
|
||||
|
||||
/* add our umask -- see big note in orted.c */
|
||||
current_umask = umask(0);
|
||||
umask(current_umask);
|
||||
|
@ -119,6 +119,9 @@ char *orte_xterm;
|
||||
/* whether or not to forward SIGTSTP and SIGCONT signals */
|
||||
bool orte_forward_job_control;
|
||||
|
||||
/* rsh support */
|
||||
char *orte_rsh_agent;
|
||||
|
||||
#endif /* !ORTE_DISABLE_FULL_RTE */
|
||||
|
||||
int orte_debug_output = -1;
|
||||
|
@ -496,6 +496,9 @@ ORTE_DECLSPEC extern char *orte_output_filename;
|
||||
/* generate new xterm windows to display output from specified ranks */
|
||||
ORTE_DECLSPEC extern char *orte_xterm;
|
||||
|
||||
/* rsh support */
|
||||
ORTE_DECLSPEC extern char *orte_rsh_agent;
|
||||
|
||||
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
||||
|
||||
END_C_DECLS
|
||||
|
@ -38,7 +38,7 @@
|
||||
|
||||
int orte_register_params(void)
|
||||
{
|
||||
int value;
|
||||
int value, tmp;
|
||||
|
||||
mca_base_param_reg_int_name("orte", "base_help_aggregate",
|
||||
"If orte_base_help_aggregate is true, duplicate help messages will be aggregated rather than displayed individually. This can be helpful for parallel jobs that experience multiple identical failures; rather than print out the same help/failure message N times, display it once with a count of how many processes sent the same message.",
|
||||
@ -269,7 +269,15 @@ int orte_register_params(void)
|
||||
false, false,
|
||||
(int) false, &value);
|
||||
orte_forward_job_control = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
|
||||
/* local rsh/ssh launch agent */
|
||||
tmp = mca_base_param_reg_string_name("orte", "rsh_agent",
|
||||
"The command used to launch executables on remote nodes (typically either \"ssh\" or \"rsh\")",
|
||||
false, false, "ssh : rsh", NULL);
|
||||
mca_base_param_reg_syn_name(tmp, "pls", "rsh_agent", true);
|
||||
mca_base_param_reg_syn_name(tmp, "plm", "rsh_agent", true);
|
||||
mca_base_param_lookup_string(tmp, &orte_rsh_agent);
|
||||
|
||||
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
@ -13,9 +13,13 @@ var=$1
|
||||
|
||||
# Push all OMPI_* MCA params into the environment.  The special
# OMPI_PREFIX=<path> entry carries the install prefix for the remote
# slave (hetero/hybrid support): prepend its value to LD_LIBRARY_PATH
# instead of exporting it verbatim.
while [ "${var:0:5}" = "OMPI_" ]; do
    # "OMPI_PREFIX" has "PREFIX" at offset 5 (not 6); the value
    # starts after the 12-char "OMPI_PREFIX=" marker
    if [ "${var:5:6}" = "PREFIX" ]; then
        export LD_LIBRARY_PATH="${var:12}":$LD_LIBRARY_PATH
    else
        export $var
    fi
    # Always consume the argument, including the PREFIX case --
    # otherwise the loop never advances and spins forever.
    shift 1
    var=$1
done
||||
|
||||
# extract the application to be executed
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user