Merge pull request #3217 from rhc54/topic/wdirs
Resolve a race condition for setting our working directory when fork/exec'ing application procs.
Этот коммит содержится в:
Коммит
10d401b6ec
@ -144,11 +144,7 @@ static int orte_odls_alps_restart_proc(orte_proc_t *child);
|
||||
static void send_error_show_help(int fd, int exit_status,
|
||||
const char *file, const char *topic, ...)
|
||||
__opal_attribute_noreturn__;
|
||||
static int do_child(orte_proc_t *child,
|
||||
char *app, char **argv,
|
||||
char **environ_copy,
|
||||
orte_job_t *jobdat, int write_fd,
|
||||
orte_iof_base_io_conf_t opts)
|
||||
static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
|
||||
__opal_attribute_noreturn__;
|
||||
|
||||
|
||||
@ -342,20 +338,15 @@ static int close_open_file_descriptors(int write_fd, orte_iof_base_io_conf_t opt
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int do_child( orte_proc_t *child,
|
||||
char *app, char **argv,
|
||||
char **environ_copy,
|
||||
orte_job_t *jobdat, int write_fd,
|
||||
orte_iof_base_io_conf_t opts)
|
||||
static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
|
||||
{
|
||||
int i, rc;
|
||||
int i;
|
||||
sigset_t sigs;
|
||||
char *param, *msg;
|
||||
|
||||
/* Setup the pipe to be close-on-exec */
|
||||
opal_fd_set_cloexec(write_fd);
|
||||
|
||||
if (NULL != child) {
|
||||
if (NULL != cd->child) {
|
||||
/* setup stdout/stderr so that any error messages that we
|
||||
may print out will get displayed back at orterun.
|
||||
|
||||
@ -369,20 +360,19 @@ static int do_child( orte_proc_t *child,
|
||||
always outputs a nice, single message indicating what
|
||||
happened
|
||||
*/
|
||||
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&opts,
|
||||
&environ_copy))) {
|
||||
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&cd->opts, &cd->env))) {
|
||||
ORTE_ERROR_LOG(i);
|
||||
send_error_show_help(write_fd, 1,
|
||||
"help-orte-odls-alps.txt",
|
||||
"iof setup failed",
|
||||
orte_process_info.nodename, app);
|
||||
orte_process_info.nodename, cd->app->app);
|
||||
/* Does not return */
|
||||
}
|
||||
|
||||
/* now set any child-level controls such as binding */
|
||||
orte_rtc.set(jobdat, child, &environ_copy, write_fd);
|
||||
orte_rtc.set(cd->jdata, cd->child, &cd->env, write_fd);
|
||||
|
||||
} else if (!ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
|
||||
} else if (!ORTE_FLAG_TEST(cd->jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
|
||||
/* tie stdin/out/err/internal to /dev/null */
|
||||
int fdnull;
|
||||
for (i=0; i < 3; i++) {
|
||||
@ -393,24 +383,24 @@ static int do_child( orte_proc_t *child,
|
||||
close(fdnull);
|
||||
}
|
||||
fdnull = open("/dev/null", O_RDONLY, 0);
|
||||
if (fdnull > opts.p_internal[1]) {
|
||||
dup2(fdnull, opts.p_internal[1]);
|
||||
if (fdnull > cd->opts.p_internal[1]) {
|
||||
dup2(fdnull, cd->opts.p_internal[1]);
|
||||
}
|
||||
close(fdnull);
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, opts)) {
|
||||
if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, cd->opts)) {
|
||||
send_error_show_help(write_fd, 1, "help-orte-odls-alps.txt",
|
||||
"close fds",
|
||||
orte_process_info.nodename, app,
|
||||
orte_process_info.nodename, cd->app->app,
|
||||
__FILE__, __LINE__);
|
||||
}
|
||||
|
||||
|
||||
if (argv == NULL) {
|
||||
argv = malloc(sizeof(char*)*2);
|
||||
argv[0] = strdup(app);
|
||||
argv[1] = NULL;
|
||||
if (cd->argv == NULL) {
|
||||
cd->argv = malloc(sizeof(char*)*2);
|
||||
cd->argv[0] = strdup(cd->app->app);
|
||||
cd->argv[1] = NULL;
|
||||
}
|
||||
|
||||
/* Set signal handlers back to the default. Do this close to
|
||||
@ -437,37 +427,33 @@ static int do_child( orte_proc_t *child,
|
||||
|
||||
if (10 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
|
||||
int jout;
|
||||
opal_output(0, "%s STARTING %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app);
|
||||
for (jout=0; NULL != argv[jout]; jout++) {
|
||||
opal_output(0, "%s\tARGV[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, argv[jout]);
|
||||
opal_output(0, "%s STARTING %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cd->app->app);
|
||||
for (jout=0; NULL != cd->argv[jout]; jout++) {
|
||||
opal_output(0, "%s\tARGV[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, cd->argv[jout]);
|
||||
}
|
||||
for (jout=0; NULL != environ_copy[jout]; jout++) {
|
||||
opal_output(0, "%s\tENVIRON[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, environ_copy[jout]);
|
||||
for (jout=0; NULL != cd->env[jout]; jout++) {
|
||||
opal_output(0, "%s\tENVIRON[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, cd->env[jout]);
|
||||
}
|
||||
}
|
||||
|
||||
execve(app, argv, environ_copy);
|
||||
execve(cd->app->app, cd->argv, cd->env);
|
||||
send_error_show_help(write_fd, 1,
|
||||
"help-orte-odls-alps.txt", "execve error",
|
||||
orte_process_info.nodename, app, strerror(errno));
|
||||
orte_process_info.nodename, cd->app->app, strerror(errno));
|
||||
/* Does not return */
|
||||
}
|
||||
|
||||
|
||||
static int do_parent(orte_proc_t *child,
|
||||
char *app, char **argv,
|
||||
char **environ_copy,
|
||||
orte_job_t *jobdat, int read_fd,
|
||||
orte_iof_base_io_conf_t opts)
|
||||
static int do_parent(orte_odls_spawn_caddy_t *cd, int read_fd)
|
||||
{
|
||||
int rc;
|
||||
orte_odls_pipe_err_msg_t msg;
|
||||
char file[ORTE_ODLS_MAX_FILE_LEN + 1], topic[ORTE_ODLS_MAX_TOPIC_LEN + 1], *str = NULL;
|
||||
|
||||
close(opts.p_stdin[0]);
|
||||
close(opts.p_stdout[1]);
|
||||
close(opts.p_stderr[1]);
|
||||
close(opts.p_internal[1]);
|
||||
close(cd->opts.p_stdin[0]);
|
||||
close(cd->opts.p_stdout[1]);
|
||||
close(cd->opts.p_stderr[1]);
|
||||
close(cd->opts.p_internal[1]);
|
||||
|
||||
/* Block reading a message from the pipe */
|
||||
while (1) {
|
||||
@ -483,18 +469,18 @@ static int do_parent(orte_proc_t *child,
|
||||
ORTE_ERROR_LOG(rc);
|
||||
close(read_fd);
|
||||
|
||||
if (NULL != child) {
|
||||
child->state = ORTE_PROC_STATE_UNDEF;
|
||||
if (NULL != cd->child) {
|
||||
cd->child->state = ORTE_PROC_STATE_UNDEF;
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* Otherwise, we got a warning or error message from the child */
|
||||
if (NULL != child) {
|
||||
if (NULL != cd->child) {
|
||||
if (msg.fatal) {
|
||||
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
|
||||
ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
|
||||
} else {
|
||||
ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
|
||||
ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
|
||||
}
|
||||
}
|
||||
|
||||
@ -504,10 +490,10 @@ static int do_parent(orte_proc_t *child,
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
orte_show_help("help-orte-odls-alps.txt", "syscall fail",
|
||||
true,
|
||||
orte_process_info.nodename, app,
|
||||
orte_process_info.nodename, cd->app,
|
||||
"opal_fd_read", __FILE__, __LINE__);
|
||||
if (NULL != child) {
|
||||
child->state = ORTE_PROC_STATE_UNDEF;
|
||||
if (NULL != cd->child) {
|
||||
cd->child->state = ORTE_PROC_STATE_UNDEF;
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
@ -518,10 +504,10 @@ static int do_parent(orte_proc_t *child,
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
orte_show_help("help-orte-odls-alps.txt", "syscall fail",
|
||||
true,
|
||||
orte_process_info.nodename, app,
|
||||
orte_process_info.nodename, cd->app,
|
||||
"opal_fd_read", __FILE__, __LINE__);
|
||||
if (NULL != child) {
|
||||
child->state = ORTE_PROC_STATE_UNDEF;
|
||||
if (NULL != cd->child) {
|
||||
cd->child->state = ORTE_PROC_STATE_UNDEF;
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
@ -532,10 +518,10 @@ static int do_parent(orte_proc_t *child,
|
||||
if (NULL == str) {
|
||||
orte_show_help("help-orte-odls-alps.txt", "syscall fail",
|
||||
true,
|
||||
orte_process_info.nodename, app,
|
||||
orte_process_info.nodename, cd->app,
|
||||
"opal_fd_read", __FILE__, __LINE__);
|
||||
if (NULL != child) {
|
||||
child->state = ORTE_PROC_STATE_UNDEF;
|
||||
if (NULL != cd->child) {
|
||||
cd->child->state = ORTE_PROC_STATE_UNDEF;
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
@ -556,9 +542,9 @@ static int do_parent(orte_proc_t *child,
|
||||
closed, indicating that the child launched
|
||||
successfully). */
|
||||
if (msg.fatal) {
|
||||
if (NULL != child) {
|
||||
child->state = ORTE_PROC_STATE_FAILED_TO_START;
|
||||
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
|
||||
if (NULL != cd->child) {
|
||||
cd->child->state = ORTE_PROC_STATE_FAILED_TO_START;
|
||||
ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
|
||||
}
|
||||
close(read_fd);
|
||||
return ORTE_ERR_FAILED_TO_START;
|
||||
@ -568,9 +554,9 @@ static int do_parent(orte_proc_t *child,
|
||||
/* If we got here, it means that the pipe closed without
|
||||
indication of a fatal error, meaning that the child process
|
||||
launched successfully. */
|
||||
if (NULL != child) {
|
||||
child->state = ORTE_PROC_STATE_RUNNING;
|
||||
ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
|
||||
if (NULL != cd->child) {
|
||||
cd->child->state = ORTE_PROC_STATE_RUNNING;
|
||||
ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
|
||||
}
|
||||
close(read_fd);
|
||||
|
||||
@ -581,14 +567,10 @@ static int do_parent(orte_proc_t *child,
|
||||
/**
|
||||
* Fork/exec the specified processes
|
||||
*/
|
||||
static int odls_alps_fork_local_proc(orte_proc_t *child,
|
||||
char *app,
|
||||
char **argv,
|
||||
char **environ_copy,
|
||||
orte_job_t *jobdat,
|
||||
orte_iof_base_io_conf_t opts)
|
||||
static int odls_alps_fork_local_proc(void *cdptr)
|
||||
{
|
||||
int rc, p[2];
|
||||
orte_odls_spawn_caddy_t *cd = (orte_odls_spawn_caddy_t*)cdptr;
|
||||
int p[2];
|
||||
pid_t pid;
|
||||
|
||||
/* A pipe is used to communicate between the parent and child to
|
||||
@ -601,24 +583,24 @@ static int odls_alps_fork_local_proc(orte_proc_t *child,
|
||||
the pipe, then the child was letting us know why it failed. */
|
||||
if (pipe(p) < 0) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_PIPES);
|
||||
if (NULL != child) {
|
||||
child->state = ORTE_PROC_STATE_FAILED_TO_START;
|
||||
child->exit_code = ORTE_ERR_SYS_LIMITS_PIPES;
|
||||
if (NULL != cd->child) {
|
||||
cd->child->state = ORTE_PROC_STATE_FAILED_TO_START;
|
||||
cd->child->exit_code = ORTE_ERR_SYS_LIMITS_PIPES;
|
||||
}
|
||||
return ORTE_ERR_SYS_LIMITS_PIPES;
|
||||
}
|
||||
|
||||
/* Fork off the child */
|
||||
pid = fork();
|
||||
if (NULL != child) {
|
||||
child->pid = pid;
|
||||
if (NULL != cd->child) {
|
||||
cd->child->pid = pid;
|
||||
}
|
||||
|
||||
if (pid < 0) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
|
||||
if (NULL != child) {
|
||||
child->state = ORTE_PROC_STATE_FAILED_TO_START;
|
||||
child->exit_code = ORTE_ERR_SYS_LIMITS_CHILDREN;
|
||||
if (NULL != cd->child) {
|
||||
cd->child->state = ORTE_PROC_STATE_FAILED_TO_START;
|
||||
cd->child->exit_code = ORTE_ERR_SYS_LIMITS_CHILDREN;
|
||||
}
|
||||
return ORTE_ERR_SYS_LIMITS_CHILDREN;
|
||||
}
|
||||
@ -628,12 +610,12 @@ static int odls_alps_fork_local_proc(orte_proc_t *child,
|
||||
#if HAVE_SETPGID
|
||||
setpgid(0, 0);
|
||||
#endif
|
||||
do_child(child, app, argv, environ_copy, jobdat, p[1], opts);
|
||||
do_child(cd, p[1]);
|
||||
/* Does not return */
|
||||
}
|
||||
|
||||
close(p[1]);
|
||||
return do_parent(child, app, argv, environ_copy, jobdat, p[0], opts);
|
||||
return do_parent(cd, p[0]);
|
||||
}
|
||||
|
||||
|
||||
@ -643,8 +625,8 @@ static int odls_alps_fork_local_proc(orte_proc_t *child,
|
||||
|
||||
int orte_odls_alps_launch_local_procs(opal_buffer_t *data)
|
||||
{
|
||||
int rc;
|
||||
orte_jobid_t job;
|
||||
int rc;
|
||||
|
||||
/* construct the list of children we are to launch */
|
||||
if (ORTE_SUCCESS != (rc = orte_odls_base_default_construct_child_list(data, &job))) {
|
||||
@ -729,4 +711,3 @@ static int orte_odls_alps_restart_proc(orte_proc_t *child)
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
@ -507,7 +507,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
|
||||
return rc;
|
||||
}
|
||||
|
||||
static int setup_path(orte_app_context_t *app)
|
||||
static int setup_path(orte_app_context_t *app, char **wdir)
|
||||
{
|
||||
int rc;
|
||||
char dir[MAXPATHLEN];
|
||||
@ -539,9 +539,12 @@ static int setup_path(orte_app_context_t *app)
|
||||
* ensuring they start out matching.
|
||||
*/
|
||||
getcwd(dir, sizeof(dir));
|
||||
*wdir = strdup(dir);
|
||||
opal_setenv("PWD", dir, true, &app->env);
|
||||
/* update the initial wdir value too */
|
||||
opal_setenv(OPAL_MCA_PREFIX"initial_wdir", dir, true, &app->env);
|
||||
} else {
|
||||
*wdir = NULL;
|
||||
}
|
||||
|
||||
/* Search for the OMPI_exec_path and PATH settings in the environment. */
|
||||
@ -631,13 +634,12 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
|
||||
orte_job_t *jobdat = cd->jdata;
|
||||
orte_app_context_t *app = cd->app;
|
||||
orte_proc_t *child = cd->child;
|
||||
char **env = NULL, **argv = NULL, *cmd = NULL;
|
||||
int rc, i;
|
||||
bool found;
|
||||
orte_proc_state_t state;
|
||||
|
||||
/* thread-protect common values */
|
||||
env = opal_argv_copy(app->env);
|
||||
cd->env = opal_argv_copy(app->env);
|
||||
|
||||
/* ensure we clear any prior info regarding state or exit status in
|
||||
* case this is a restart
|
||||
@ -646,7 +648,7 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
|
||||
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_WAITPID);
|
||||
|
||||
/* setup the pmix environment */
|
||||
if (OPAL_SUCCESS != (rc = opal_pmix.server_setup_fork(&child->name, &env))) {
|
||||
if (OPAL_SUCCESS != (rc = opal_pmix.server_setup_fork(&child->name, &cd->env))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
|
||||
goto errorout;
|
||||
@ -680,16 +682,16 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
|
||||
child->name.vpid == nm->name.vpid) {
|
||||
/* we want this one - modify the app's command to include
|
||||
* the orte xterm cmd that starts with the xtermcmd */
|
||||
argv = opal_argv_copy(orte_odls_globals.xtermcmd);
|
||||
cd->argv = opal_argv_copy(orte_odls_globals.xtermcmd);
|
||||
/* insert the rank into the correct place as a window title */
|
||||
free(argv[2]);
|
||||
asprintf(&argv[2], "Rank %s", ORTE_VPID_PRINT(child->name.vpid));
|
||||
free(cd->argv[2]);
|
||||
asprintf(&cd->argv[2], "Rank %s", ORTE_VPID_PRINT(child->name.vpid));
|
||||
/* add in the argv from the app */
|
||||
for (i=0; NULL != app->argv[i]; i++) {
|
||||
opal_argv_append_nosize(&argv, app->argv[i]);
|
||||
opal_argv_append_nosize(&cd->argv, app->argv[i]);
|
||||
}
|
||||
/* use the xterm cmd as the app string */
|
||||
cmd = strdup(orte_odls_globals.xtermcmd[0]);
|
||||
cd->cmd = strdup(orte_odls_globals.xtermcmd[0]);
|
||||
found = true;
|
||||
break;
|
||||
} else if (jobdat->num_procs <= nm->name.vpid) { /* check for bozo case */
|
||||
@ -703,21 +705,21 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
cmd = strdup(app->app);
|
||||
argv = opal_argv_copy(app->argv);
|
||||
cd->cmd = strdup(app->app);
|
||||
cd->argv = opal_argv_copy(app->argv);
|
||||
}
|
||||
} else if (NULL != orte_fork_agent) {
|
||||
/* we were given a fork agent - use it */
|
||||
argv = opal_argv_copy(orte_fork_agent);
|
||||
cd->argv = opal_argv_copy(orte_fork_agent);
|
||||
/* add in the argv from the app */
|
||||
for (i=0; NULL != app->argv[i]; i++) {
|
||||
opal_argv_append_nosize(&argv, app->argv[i]);
|
||||
opal_argv_append_nosize(&cd->argv, app->argv[i]);
|
||||
}
|
||||
/* the app exe name itself is in the argvsav array, so
|
||||
* we can recover it from there later
|
||||
*/
|
||||
cmd = opal_path_findv(orte_fork_agent[0], X_OK, orte_launch_environ, NULL);
|
||||
if (NULL == cmd) {
|
||||
cd->cmd = opal_path_findv(orte_fork_agent[0], X_OK, orte_launch_environ, NULL);
|
||||
if (NULL == cd->cmd) {
|
||||
orte_show_help("help-orte-odls-base.txt",
|
||||
"orte-odls-base:fork-agent-not-found",
|
||||
true, orte_process_info.nodename, orte_fork_agent[0]);
|
||||
@ -725,14 +727,14 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
|
||||
goto errorout;
|
||||
}
|
||||
} else {
|
||||
cmd = strdup(app->app);
|
||||
argv = opal_argv_copy(app->argv);
|
||||
cd->cmd = strdup(app->app);
|
||||
cd->argv = opal_argv_copy(app->argv);
|
||||
}
|
||||
|
||||
/* setup the rest of the environment with the proc-specific items - these
|
||||
* will be overwritten for each child
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app, &env))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app, &cd->env))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
|
||||
goto errorout;
|
||||
@ -741,9 +743,9 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
|
||||
/* if we are indexing the argv by rank, do so now */
|
||||
if (cd->index_argv) {
|
||||
char *param;
|
||||
asprintf(&param, "%s-%d", argv[0], (int)child->name.vpid);
|
||||
free(argv[0]);
|
||||
argv[0] = param;
|
||||
asprintf(&param, "%s-%d", cd->argv[0], (int)child->name.vpid);
|
||||
free(cd->argv[0]);
|
||||
cd->argv[0] = param;
|
||||
}
|
||||
|
||||
if (5 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
|
||||
@ -757,37 +759,19 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
|
||||
}
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = cd->fork_local(child, cmd, argv, env, jobdat, cd->opts))) {
|
||||
if (ORTE_SUCCESS != (rc = cd->fork_local(cd))) {
|
||||
/* error message already output */
|
||||
state = ORTE_PROC_STATE_FAILED_TO_START;
|
||||
goto errorout;
|
||||
}
|
||||
|
||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_RUNNING);
|
||||
if (NULL != env) {
|
||||
opal_argv_free(env);
|
||||
}
|
||||
if (NULL != argv) {
|
||||
opal_argv_free(argv);
|
||||
}
|
||||
if (NULL != cmd) {
|
||||
free(cmd);
|
||||
}
|
||||
OBJ_RELEASE(cd);
|
||||
return;
|
||||
|
||||
errorout:
|
||||
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
|
||||
ORTE_ACTIVATE_PROC_STATE(&child->name, state);
|
||||
if (NULL != env) {
|
||||
opal_argv_free(env);
|
||||
}
|
||||
if (NULL != argv) {
|
||||
opal_argv_free(argv);
|
||||
}
|
||||
if (NULL != cmd) {
|
||||
free(cmd);
|
||||
}
|
||||
OBJ_RELEASE(cd);
|
||||
}
|
||||
|
||||
@ -807,6 +791,7 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
||||
char *msg;
|
||||
orte_odls_spawn_caddy_t *cd;
|
||||
opal_event_base_t *evb;
|
||||
char *effective_dir = NULL;
|
||||
|
||||
opal_output_verbose(5, orte_odls_base_framework.framework_output,
|
||||
"%s local:launch",
|
||||
@ -945,7 +930,7 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
||||
/* setup the working directory for this app - will jump us
|
||||
* to that directory
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = setup_path(app))) {
|
||||
if (ORTE_SUCCESS != (rc = setup_path(app, &effective_dir))) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||
"%s odls:launch:setup_path failed with error %s(%d)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -1009,6 +994,15 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
||||
goto GETOUT;
|
||||
}
|
||||
|
||||
/* reset our working directory back to our default location - if we
|
||||
* don't do this, then we will be looking for relative paths starting
|
||||
* from the last wdir option specified by the user. Thus, we would
|
||||
* be requiring that the user keep track on the cmd line of where
|
||||
* each app was located relative to the prior app, instead of relative
|
||||
* to their current location
|
||||
*/
|
||||
chdir(basedir);
|
||||
|
||||
/* okay, now let's launch all the local procs for this app using the provided fork_local fn */
|
||||
for (idx=0; idx < orte_local_children->size; idx++) {
|
||||
if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
|
||||
@ -1066,6 +1060,9 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
||||
|
||||
/* dispatch this child to the next available launch thread */
|
||||
cd = OBJ_NEW(orte_odls_spawn_caddy_t);
|
||||
if (NULL != effective_dir) {
|
||||
cd->wdir = strdup(effective_dir);
|
||||
}
|
||||
cd->jdata = jobdat;
|
||||
cd->app = app;
|
||||
cd->child = child;
|
||||
@ -1114,14 +1111,9 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
||||
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);
|
||||
|
||||
}
|
||||
/* reset our working directory back to our default location - if we
|
||||
* don't do this, then we will be looking for relative paths starting
|
||||
* from the last wdir option specified by the user. Thus, we would
|
||||
* be requiring that the user keep track on the cmd line of where
|
||||
* each app was located relative to the prior app, instead of relative
|
||||
* to their current location
|
||||
*/
|
||||
chdir(basedir);
|
||||
if (NULL != effective_dir) {
|
||||
free(effective_dir);
|
||||
}
|
||||
}
|
||||
|
||||
GETOUT:
|
||||
@ -1682,7 +1674,9 @@ int orte_odls_base_default_restart_proc(orte_proc_t *child,
|
||||
orte_app_context_t *app;
|
||||
orte_job_t *jobdat;
|
||||
char basedir[MAXPATHLEN];
|
||||
orte_iof_base_io_conf_t opts;
|
||||
char *wdir = NULL;
|
||||
orte_odls_spawn_caddy_t *cd;
|
||||
opal_event_base_t *evb;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||
"%s odls:restart_proc for proc %s",
|
||||
@ -1720,29 +1714,65 @@ int orte_odls_base_default_restart_proc(orte_proc_t *child,
|
||||
}
|
||||
|
||||
/* setup the path */
|
||||
if (ORTE_SUCCESS != (rc = setup_path(app))) {
|
||||
if (ORTE_SUCCESS != (rc = setup_path(app, &wdir))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
/* dispatch this child to the next available launch thread */
|
||||
cd = OBJ_NEW(orte_odls_spawn_caddy_t);
|
||||
if (NULL != wdir) {
|
||||
cd->wdir = strdup(wdir);
|
||||
free(wdir);
|
||||
}
|
||||
cd->jdata = jobdat;
|
||||
cd->app = app;
|
||||
cd->child = child;
|
||||
cd->fork_local = fork_local;
|
||||
/* setup any IOF */
|
||||
memset(&opts, 0, sizeof(orte_iof_base_io_conf_t));
|
||||
cd->opts.usepty = OPAL_ENABLE_PTY_SUPPORT;
|
||||
|
||||
/* do we want to setup stdin? */
|
||||
if (jobdat->stdin_target == ORTE_VPID_WILDCARD ||
|
||||
child->name.vpid == jobdat->stdin_target) {
|
||||
cd->opts.connect_stdin = true;
|
||||
} else {
|
||||
cd->opts.connect_stdin = false;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_iof_base_setup_prefork(&cd->opts))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
child->exit_code = rc;
|
||||
OBJ_RELEASE(cd);
|
||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
|
||||
goto CLEANUP;
|
||||
}
|
||||
if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
|
||||
/* connect endpoints IOF */
|
||||
rc = orte_iof_base_setup_parent(&child->name, &opts);
|
||||
rc = orte_iof_base_setup_parent(&child->name, &cd->opts);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_START);
|
||||
OBJ_RELEASE(cd);
|
||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
|
||||
goto CLEANUP;
|
||||
}
|
||||
}
|
||||
orte_wait_cb(child, odls_base_default_wait_local_proc, NULL);
|
||||
|
||||
++orte_odls_globals.next_base;
|
||||
if (orte_odls_globals.num_threads <= orte_odls_globals.next_base) {
|
||||
orte_odls_globals.next_base = 0;
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||
"%s restarting app %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->app));
|
||||
|
||||
orte_wait_cb(child, odls_base_default_wait_local_proc, NULL);
|
||||
if (ORTE_SUCCESS != (rc = fork_local(child, app->app, app->argv, app->env, jobdat, opts))) {
|
||||
evb = orte_odls_globals.ev_bases[orte_odls_globals.next_base];
|
||||
opal_event_set(evb, &cd->ev, -1,
|
||||
OPAL_EV_WRITE, orte_odls_base_spawn_proc, cd);
|
||||
opal_event_set_priority(&cd->ev, ORTE_MSG_PRI);
|
||||
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);
|
||||
|
||||
if (ORTE_SUCCESS != (rc = fork_local(cd))) {
|
||||
orte_wait_cb_cancel(child);
|
||||
child->exit_code = ORTE_ERR_SILENT; /* error message already output */
|
||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_START);
|
||||
|
@ -239,7 +239,26 @@ OBJ_CLASS_INSTANCE(orte_odls_launch_local_t,
|
||||
static void sccon(orte_odls_spawn_caddy_t *p)
|
||||
{
|
||||
memset(&p->opts, 0, sizeof(orte_iof_base_io_conf_t));
|
||||
p->cmd = NULL;
|
||||
p->wdir = NULL;
|
||||
p->argv = NULL;
|
||||
p->env = NULL;
|
||||
}
|
||||
static void scdes(orte_odls_spawn_caddy_t *p)
|
||||
{
|
||||
if (NULL != p->cmd) {
|
||||
free(p->cmd);
|
||||
}
|
||||
if (NULL != p->wdir) {
|
||||
free(p->wdir);
|
||||
}
|
||||
if (NULL != p->argv) {
|
||||
opal_argv_free(p->argv);
|
||||
}
|
||||
if (NULL != p->env) {
|
||||
opal_argv_free(p->env);
|
||||
}
|
||||
}
|
||||
OBJ_CLASS_INSTANCE(orte_odls_spawn_caddy_t,
|
||||
opal_object_t,
|
||||
sccon, NULL);
|
||||
sccon, scdes);
|
||||
|
@ -82,16 +82,16 @@ orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
||||
ORTE_DECLSPEC void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata);
|
||||
|
||||
/* define a function that will fork a local proc */
|
||||
typedef int (*orte_odls_base_fork_local_proc_fn_t)(orte_proc_t *child,
|
||||
char *app, char **argv,
|
||||
char **environ_copy,
|
||||
orte_job_t *jdata,
|
||||
orte_iof_base_io_conf_t opts);
|
||||
typedef int (*orte_odls_base_fork_local_proc_fn_t)(void *cd);
|
||||
|
||||
/* define an object for fork/exec the local proc */
|
||||
typedef struct {
|
||||
opal_object_t super;
|
||||
opal_event_t ev;
|
||||
char *cmd;
|
||||
char *wdir;
|
||||
char **argv;
|
||||
char **env;
|
||||
orte_job_t *jdata;
|
||||
orte_app_context_t *app;
|
||||
orte_proc_t *child;
|
||||
|
@ -145,11 +145,7 @@ static void send_error_show_help(int fd, int exit_status,
|
||||
const char *file, const char *topic, ...)
|
||||
__opal_attribute_noreturn__;
|
||||
|
||||
static int do_child(orte_proc_t *child,
|
||||
char *cmd, char **argv,
|
||||
char **environ_copy,
|
||||
orte_job_t *jobdat, int write_fd,
|
||||
orte_iof_base_io_conf_t opts)
|
||||
static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
|
||||
__opal_attribute_noreturn__;
|
||||
|
||||
|
||||
@ -319,11 +315,7 @@ static int close_open_file_descriptors(int write_fd,
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int do_child(orte_proc_t *child,
|
||||
char *app, char **argv,
|
||||
char **environ_copy,
|
||||
orte_job_t *jobdat, int write_fd,
|
||||
orte_iof_base_io_conf_t opts)
|
||||
static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
|
||||
{
|
||||
int i;
|
||||
sigset_t sigs;
|
||||
@ -339,7 +331,7 @@ static int do_child(orte_proc_t *child,
|
||||
/* Setup the pipe to be close-on-exec */
|
||||
opal_fd_set_cloexec(write_fd);
|
||||
|
||||
if (NULL != child) {
|
||||
if (NULL != cd->child) {
|
||||
/* setup stdout/stderr so that any error messages that we
|
||||
may print out will get displayed back at orterun.
|
||||
|
||||
@ -353,22 +345,21 @@ static int do_child(orte_proc_t *child,
|
||||
always outputs a nice, single message indicating what
|
||||
happened
|
||||
*/
|
||||
if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
|
||||
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&opts,
|
||||
&environ_copy))) {
|
||||
if (ORTE_FLAG_TEST(cd->jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
|
||||
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&cd->opts, &cd->env))) {
|
||||
ORTE_ERROR_LOG(i);
|
||||
send_error_show_help(write_fd, 1,
|
||||
"help-orte-odls-default.txt",
|
||||
"iof setup failed",
|
||||
orte_process_info.nodename, app);
|
||||
orte_process_info.nodename, cd->app->app);
|
||||
/* Does not return */
|
||||
}
|
||||
}
|
||||
|
||||
/* now set any child-level controls such as binding */
|
||||
orte_rtc.set(jobdat, child, &environ_copy, write_fd);
|
||||
orte_rtc.set(cd->jdata, cd->child, &cd->env, write_fd);
|
||||
|
||||
} else if (!ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
|
||||
} else if (!ORTE_FLAG_TEST(cd->jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
|
||||
/* tie stdin/out/err/internal to /dev/null */
|
||||
int fdnull;
|
||||
for (i=0; i < 3; i++) {
|
||||
@ -379,8 +370,8 @@ static int do_child(orte_proc_t *child,
|
||||
close(fdnull);
|
||||
}
|
||||
fdnull = open("/dev/null", O_RDONLY, 0);
|
||||
if (fdnull > opts.p_internal[1]) {
|
||||
dup2(fdnull, opts.p_internal[1]);
|
||||
if (fdnull > cd->opts.p_internal[1]) {
|
||||
dup2(fdnull, cd->opts.p_internal[1]);
|
||||
}
|
||||
close(fdnull);
|
||||
}
|
||||
@ -388,19 +379,19 @@ static int do_child(orte_proc_t *child,
|
||||
/* close all open file descriptors w/ exception of stdin/stdout/stderr,
|
||||
the pipe used for the IOF INTERNAL messages, and the pipe up to
|
||||
the parent. */
|
||||
if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, opts)) {
|
||||
if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, cd->opts)) {
|
||||
// close *all* file descriptors -- slow
|
||||
for(fd=3; fd<fdmax; fd++) {
|
||||
if (fd != opts.p_internal[1] && fd != write_fd) {
|
||||
if (fd != cd->opts.p_internal[1] && fd != write_fd) {
|
||||
close(fd);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (argv == NULL) {
|
||||
argv = malloc(sizeof(char*)*2);
|
||||
argv[0] = strdup(app);
|
||||
argv[1] = NULL;
|
||||
if (cd->argv == NULL) {
|
||||
cd->argv = malloc(sizeof(char*)*2);
|
||||
cd->argv[0] = strdup(cd->app->app);
|
||||
cd->argv[1] = NULL;
|
||||
}
|
||||
|
||||
/* Set signal handlers back to the default. Do this close to
|
||||
@ -423,31 +414,31 @@ static int do_child(orte_proc_t *child,
|
||||
sigprocmask(0, 0, &sigs);
|
||||
sigprocmask(SIG_UNBLOCK, &sigs, 0);
|
||||
|
||||
/* Exec the new executable */
|
||||
/* take us to the correct wdir */
|
||||
if (NULL != cd->wdir) {
|
||||
chdir(cd->wdir);
|
||||
}
|
||||
|
||||
execve(app, argv, environ_copy);
|
||||
/* Exec the new executable */
|
||||
execve(cd->app->app, cd->argv, cd->env);
|
||||
getcwd(dir, sizeof(dir));
|
||||
send_error_show_help(write_fd, 1,
|
||||
"help-orte-odls-default.txt", "execve error",
|
||||
orte_process_info.nodename, dir, app, strerror(errno));
|
||||
orte_process_info.nodename, dir, cd->app->app, strerror(errno));
|
||||
/* Does not return */
|
||||
}
|
||||
|
||||
|
||||
static int do_parent(orte_proc_t *child,
|
||||
char *app, char **argv,
|
||||
char **environ_copy,
|
||||
orte_job_t *jobdat, int read_fd,
|
||||
orte_iof_base_io_conf_t opts)
|
||||
static int do_parent(orte_odls_spawn_caddy_t *cd, int read_fd)
|
||||
{
|
||||
int rc;
|
||||
orte_odls_pipe_err_msg_t msg;
|
||||
char file[ORTE_ODLS_MAX_FILE_LEN + 1], topic[ORTE_ODLS_MAX_TOPIC_LEN + 1], *str = NULL;
|
||||
|
||||
close(opts.p_stdin[0]);
|
||||
close(opts.p_stdout[1]);
|
||||
close(opts.p_stderr[1]);
|
||||
close(opts.p_internal[1]);
|
||||
close(cd->opts.p_stdin[0]);
|
||||
close(cd->opts.p_stdout[1]);
|
||||
close(cd->opts.p_stderr[1]);
|
||||
close(cd->opts.p_internal[1]);
|
||||
|
||||
/* Block reading a message from the pipe */
|
||||
while (1) {
|
||||
@ -463,18 +454,18 @@ static int do_parent(orte_proc_t *child,
|
||||
ORTE_ERROR_LOG(rc);
|
||||
close(read_fd);
|
||||
|
||||
if (NULL != child) {
|
||||
child->state = ORTE_PROC_STATE_UNDEF;
|
||||
if (NULL != cd->child) {
|
||||
cd->child->state = ORTE_PROC_STATE_UNDEF;
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* Otherwise, we got a warning or error message from the child */
|
||||
if (NULL != child) {
|
||||
if (NULL != cd->child) {
|
||||
if (msg.fatal) {
|
||||
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
|
||||
ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
|
||||
} else {
|
||||
ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
|
||||
ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
|
||||
}
|
||||
}
|
||||
|
||||
@ -484,10 +475,10 @@ static int do_parent(orte_proc_t *child,
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
orte_show_help("help-orte-odls-default.txt", "syscall fail",
|
||||
true,
|
||||
orte_process_info.nodename, app,
|
||||
orte_process_info.nodename, cd->app->app,
|
||||
"opal_fd_read", __FILE__, __LINE__);
|
||||
if (NULL != child) {
|
||||
child->state = ORTE_PROC_STATE_UNDEF;
|
||||
if (NULL != cd->child) {
|
||||
cd->child->state = ORTE_PROC_STATE_UNDEF;
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
@ -498,10 +489,10 @@ static int do_parent(orte_proc_t *child,
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
orte_show_help("help-orte-odls-default.txt", "syscall fail",
|
||||
true,
|
||||
orte_process_info.nodename, app,
|
||||
orte_process_info.nodename, cd->app->app,
|
||||
"opal_fd_read", __FILE__, __LINE__);
|
||||
if (NULL != child) {
|
||||
child->state = ORTE_PROC_STATE_UNDEF;
|
||||
if (NULL != cd->child) {
|
||||
cd->child->state = ORTE_PROC_STATE_UNDEF;
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
@ -512,10 +503,10 @@ static int do_parent(orte_proc_t *child,
|
||||
if (NULL == str) {
|
||||
orte_show_help("help-orte-odls-default.txt", "syscall fail",
|
||||
true,
|
||||
orte_process_info.nodename, app,
|
||||
orte_process_info.nodename, cd->app->app,
|
||||
"opal_fd_read", __FILE__, __LINE__);
|
||||
if (NULL != child) {
|
||||
child->state = ORTE_PROC_STATE_UNDEF;
|
||||
if (NULL != cd->child) {
|
||||
cd->child->state = ORTE_PROC_STATE_UNDEF;
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
@ -536,9 +527,9 @@ static int do_parent(orte_proc_t *child,
|
||||
closed, indicating that the child launched
|
||||
successfully). */
|
||||
if (msg.fatal) {
|
||||
if (NULL != child) {
|
||||
child->state = ORTE_PROC_STATE_FAILED_TO_START;
|
||||
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
|
||||
if (NULL != cd->child) {
|
||||
cd->child->state = ORTE_PROC_STATE_FAILED_TO_START;
|
||||
ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
|
||||
}
|
||||
close(read_fd);
|
||||
return ORTE_ERR_FAILED_TO_START;
|
||||
@ -548,9 +539,9 @@ static int do_parent(orte_proc_t *child,
|
||||
/* If we got here, it means that the pipe closed without
|
||||
indication of a fatal error, meaning that the child process
|
||||
launched successfully. */
|
||||
if (NULL != child) {
|
||||
child->state = ORTE_PROC_STATE_RUNNING;
|
||||
ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
|
||||
if (NULL != cd->child) {
|
||||
cd->child->state = ORTE_PROC_STATE_RUNNING;
|
||||
ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
|
||||
}
|
||||
close(read_fd);
|
||||
|
||||
@ -561,15 +552,12 @@ static int do_parent(orte_proc_t *child,
|
||||
/**
|
||||
* Fork/exec the specified processes
|
||||
*/
|
||||
static int odls_default_fork_local_proc(orte_proc_t *child,
|
||||
char *app,
|
||||
char **argv,
|
||||
char **environ_copy,
|
||||
orte_job_t *jobdat,
|
||||
orte_iof_base_io_conf_t opts)
|
||||
static int odls_default_fork_local_proc(void *cdptr)
|
||||
{
|
||||
orte_odls_spawn_caddy_t *cd = (orte_odls_spawn_caddy_t*)cdptr;
|
||||
int p[2];
|
||||
pid_t pid;
|
||||
orte_proc_t *child = cd->child;
|
||||
|
||||
/* A pipe is used to communicate between the parent and child to
|
||||
indicate whether the exec ultimately succeeded or failed. The
|
||||
@ -605,12 +593,12 @@ static int odls_default_fork_local_proc(orte_proc_t *child,
|
||||
|
||||
if (pid == 0) {
|
||||
close(p[0]);
|
||||
do_child(child, app, argv, environ_copy, jobdat, p[1], opts);
|
||||
do_child(cd, p[1]);
|
||||
/* Does not return */
|
||||
}
|
||||
|
||||
close(p[1]);
|
||||
return do_parent(child, app, argv, environ_copy, jobdat, p[0], opts);
|
||||
return do_parent(cd, p[0]);
|
||||
}
|
||||
|
||||
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user