1
1

Merge pull request #3217 from rhc54/topic/wdirs

Resolve a race condition for setting our working directory when fork/exec'ing application procs.
Этот коммит содержится в:
Ralph Castain 2017-03-21 17:39:54 -07:00 коммит произвёл GitHub
родитель 09a7b0ffad 74fd2c30af
Коммит 10d401b6ec
5 изменённых файлов: 227 добавлений и 209 удалений

Просмотреть файл

@ -144,11 +144,7 @@ static int orte_odls_alps_restart_proc(orte_proc_t *child);
static void send_error_show_help(int fd, int exit_status,
const char *file, const char *topic, ...)
__opal_attribute_noreturn__;
static int do_child(orte_proc_t *child,
char *app, char **argv,
char **environ_copy,
orte_job_t *jobdat, int write_fd,
orte_iof_base_io_conf_t opts)
static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
__opal_attribute_noreturn__;
@ -342,20 +338,15 @@ static int close_open_file_descriptors(int write_fd, orte_iof_base_io_conf_t opt
return ORTE_SUCCESS;
}
static int do_child( orte_proc_t *child,
char *app, char **argv,
char **environ_copy,
orte_job_t *jobdat, int write_fd,
orte_iof_base_io_conf_t opts)
static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
{
int i, rc;
int i;
sigset_t sigs;
char *param, *msg;
/* Setup the pipe to be close-on-exec */
opal_fd_set_cloexec(write_fd);
if (NULL != child) {
if (NULL != cd->child) {
/* setup stdout/stderr so that any error messages that we
may print out will get displayed back at orterun.
@ -369,20 +360,19 @@ static int do_child( orte_proc_t *child,
always outputs a nice, single message indicating what
happened
*/
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&opts,
&environ_copy))) {
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&cd->opts, &cd->env))) {
ORTE_ERROR_LOG(i);
send_error_show_help(write_fd, 1,
"help-orte-odls-alps.txt",
"iof setup failed",
orte_process_info.nodename, app);
orte_process_info.nodename, cd->app->app);
/* Does not return */
}
/* now set any child-level controls such as binding */
orte_rtc.set(jobdat, child, &environ_copy, write_fd);
orte_rtc.set(cd->jdata, cd->child, &cd->env, write_fd);
} else if (!ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
} else if (!ORTE_FLAG_TEST(cd->jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
/* tie stdin/out/err/internal to /dev/null */
int fdnull;
for (i=0; i < 3; i++) {
@ -393,24 +383,24 @@ static int do_child( orte_proc_t *child,
close(fdnull);
}
fdnull = open("/dev/null", O_RDONLY, 0);
if (fdnull > opts.p_internal[1]) {
dup2(fdnull, opts.p_internal[1]);
if (fdnull > cd->opts.p_internal[1]) {
dup2(fdnull, cd->opts.p_internal[1]);
}
close(fdnull);
}
if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, opts)) {
if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, cd->opts)) {
send_error_show_help(write_fd, 1, "help-orte-odls-alps.txt",
"close fds",
orte_process_info.nodename, app,
orte_process_info.nodename, cd->app->app,
__FILE__, __LINE__);
}
if (argv == NULL) {
argv = malloc(sizeof(char*)*2);
argv[0] = strdup(app);
argv[1] = NULL;
if (cd->argv == NULL) {
cd->argv = malloc(sizeof(char*)*2);
cd->argv[0] = strdup(cd->app->app);
cd->argv[1] = NULL;
}
/* Set signal handlers back to the default. Do this close to
@ -437,37 +427,33 @@ static int do_child( orte_proc_t *child,
if (10 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
int jout;
opal_output(0, "%s STARTING %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app);
for (jout=0; NULL != argv[jout]; jout++) {
opal_output(0, "%s\tARGV[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, argv[jout]);
opal_output(0, "%s STARTING %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cd->app->app);
for (jout=0; NULL != cd->argv[jout]; jout++) {
opal_output(0, "%s\tARGV[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, cd->argv[jout]);
}
for (jout=0; NULL != environ_copy[jout]; jout++) {
opal_output(0, "%s\tENVIRON[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, environ_copy[jout]);
for (jout=0; NULL != cd->env[jout]; jout++) {
opal_output(0, "%s\tENVIRON[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, cd->env[jout]);
}
}
execve(app, argv, environ_copy);
execve(cd->app->app, cd->argv, cd->env);
send_error_show_help(write_fd, 1,
"help-orte-odls-alps.txt", "execve error",
orte_process_info.nodename, app, strerror(errno));
orte_process_info.nodename, cd->app->app, strerror(errno));
/* Does not return */
}
static int do_parent(orte_proc_t *child,
char *app, char **argv,
char **environ_copy,
orte_job_t *jobdat, int read_fd,
orte_iof_base_io_conf_t opts)
static int do_parent(orte_odls_spawn_caddy_t *cd, int read_fd)
{
int rc;
orte_odls_pipe_err_msg_t msg;
char file[ORTE_ODLS_MAX_FILE_LEN + 1], topic[ORTE_ODLS_MAX_TOPIC_LEN + 1], *str = NULL;
close(opts.p_stdin[0]);
close(opts.p_stdout[1]);
close(opts.p_stderr[1]);
close(opts.p_internal[1]);
close(cd->opts.p_stdin[0]);
close(cd->opts.p_stdout[1]);
close(cd->opts.p_stderr[1]);
close(cd->opts.p_internal[1]);
/* Block reading a message from the pipe */
while (1) {
@ -483,18 +469,18 @@ static int do_parent(orte_proc_t *child,
ORTE_ERROR_LOG(rc);
close(read_fd);
if (NULL != child) {
child->state = ORTE_PROC_STATE_UNDEF;
if (NULL != cd->child) {
cd->child->state = ORTE_PROC_STATE_UNDEF;
}
return rc;
}
/* Otherwise, we got a warning or error message from the child */
if (NULL != child) {
if (NULL != cd->child) {
if (msg.fatal) {
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
} else {
ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
}
}
@ -504,10 +490,10 @@ static int do_parent(orte_proc_t *child,
if (OPAL_SUCCESS != rc) {
orte_show_help("help-orte-odls-alps.txt", "syscall fail",
true,
orte_process_info.nodename, app,
orte_process_info.nodename, cd->app,
"opal_fd_read", __FILE__, __LINE__);
if (NULL != child) {
child->state = ORTE_PROC_STATE_UNDEF;
if (NULL != cd->child) {
cd->child->state = ORTE_PROC_STATE_UNDEF;
}
return rc;
}
@ -518,10 +504,10 @@ static int do_parent(orte_proc_t *child,
if (OPAL_SUCCESS != rc) {
orte_show_help("help-orte-odls-alps.txt", "syscall fail",
true,
orte_process_info.nodename, app,
orte_process_info.nodename, cd->app,
"opal_fd_read", __FILE__, __LINE__);
if (NULL != child) {
child->state = ORTE_PROC_STATE_UNDEF;
if (NULL != cd->child) {
cd->child->state = ORTE_PROC_STATE_UNDEF;
}
return rc;
}
@ -532,10 +518,10 @@ static int do_parent(orte_proc_t *child,
if (NULL == str) {
orte_show_help("help-orte-odls-alps.txt", "syscall fail",
true,
orte_process_info.nodename, app,
orte_process_info.nodename, cd->app,
"opal_fd_read", __FILE__, __LINE__);
if (NULL != child) {
child->state = ORTE_PROC_STATE_UNDEF;
if (NULL != cd->child) {
cd->child->state = ORTE_PROC_STATE_UNDEF;
}
return rc;
}
@ -556,9 +542,9 @@ static int do_parent(orte_proc_t *child,
closed, indicating that the child launched
successfully). */
if (msg.fatal) {
if (NULL != child) {
child->state = ORTE_PROC_STATE_FAILED_TO_START;
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
if (NULL != cd->child) {
cd->child->state = ORTE_PROC_STATE_FAILED_TO_START;
ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
}
close(read_fd);
return ORTE_ERR_FAILED_TO_START;
@ -568,9 +554,9 @@ static int do_parent(orte_proc_t *child,
/* If we got here, it means that the pipe closed without
indication of a fatal error, meaning that the child process
launched successfully. */
if (NULL != child) {
child->state = ORTE_PROC_STATE_RUNNING;
ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
if (NULL != cd->child) {
cd->child->state = ORTE_PROC_STATE_RUNNING;
ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
}
close(read_fd);
@ -581,14 +567,10 @@ static int do_parent(orte_proc_t *child,
/**
* Fork/exec the specified processes
*/
static int odls_alps_fork_local_proc(orte_proc_t *child,
char *app,
char **argv,
char **environ_copy,
orte_job_t *jobdat,
orte_iof_base_io_conf_t opts)
static int odls_alps_fork_local_proc(void *cdptr)
{
int rc, p[2];
orte_odls_spawn_caddy_t *cd = (orte_odls_spawn_caddy_t*)cdptr;
int p[2];
pid_t pid;
/* A pipe is used to communicate between the parent and child to
@ -601,24 +583,24 @@ static int odls_alps_fork_local_proc(orte_proc_t *child,
the pipe, then the child was letting us know why it failed. */
if (pipe(p) < 0) {
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_PIPES);
if (NULL != child) {
child->state = ORTE_PROC_STATE_FAILED_TO_START;
child->exit_code = ORTE_ERR_SYS_LIMITS_PIPES;
if (NULL != cd->child) {
cd->child->state = ORTE_PROC_STATE_FAILED_TO_START;
cd->child->exit_code = ORTE_ERR_SYS_LIMITS_PIPES;
}
return ORTE_ERR_SYS_LIMITS_PIPES;
}
/* Fork off the child */
pid = fork();
if (NULL != child) {
child->pid = pid;
if (NULL != cd->child) {
cd->child->pid = pid;
}
if (pid < 0) {
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
if (NULL != child) {
child->state = ORTE_PROC_STATE_FAILED_TO_START;
child->exit_code = ORTE_ERR_SYS_LIMITS_CHILDREN;
if (NULL != cd->child) {
cd->child->state = ORTE_PROC_STATE_FAILED_TO_START;
cd->child->exit_code = ORTE_ERR_SYS_LIMITS_CHILDREN;
}
return ORTE_ERR_SYS_LIMITS_CHILDREN;
}
@ -628,12 +610,12 @@ static int odls_alps_fork_local_proc(orte_proc_t *child,
#if HAVE_SETPGID
setpgid(0, 0);
#endif
do_child(child, app, argv, environ_copy, jobdat, p[1], opts);
do_child(cd, p[1]);
/* Does not return */
}
close(p[1]);
return do_parent(child, app, argv, environ_copy, jobdat, p[0], opts);
return do_parent(cd, p[0]);
}
@ -643,8 +625,8 @@ static int odls_alps_fork_local_proc(orte_proc_t *child,
int orte_odls_alps_launch_local_procs(opal_buffer_t *data)
{
int rc;
orte_jobid_t job;
int rc;
/* construct the list of children we are to launch */
if (ORTE_SUCCESS != (rc = orte_odls_base_default_construct_child_list(data, &job))) {
@ -729,4 +711,3 @@ static int orte_odls_alps_restart_proc(orte_proc_t *child)
}
return rc;
}

Просмотреть файл

@ -507,7 +507,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
return rc;
}
static int setup_path(orte_app_context_t *app)
static int setup_path(orte_app_context_t *app, char **wdir)
{
int rc;
char dir[MAXPATHLEN];
@ -539,9 +539,12 @@ static int setup_path(orte_app_context_t *app)
* ensuring they start out matching.
*/
getcwd(dir, sizeof(dir));
*wdir = strdup(dir);
opal_setenv("PWD", dir, true, &app->env);
/* update the initial wdir value too */
opal_setenv(OPAL_MCA_PREFIX"initial_wdir", dir, true, &app->env);
} else {
*wdir = NULL;
}
/* Search for the OMPI_exec_path and PATH settings in the environment. */
@ -631,13 +634,12 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
orte_job_t *jobdat = cd->jdata;
orte_app_context_t *app = cd->app;
orte_proc_t *child = cd->child;
char **env = NULL, **argv = NULL, *cmd = NULL;
int rc, i;
bool found;
orte_proc_state_t state;
/* thread-protect common values */
env = opal_argv_copy(app->env);
cd->env = opal_argv_copy(app->env);
/* ensure we clear any prior info regarding state or exit status in
* case this is a restart
@ -646,7 +648,7 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_WAITPID);
/* setup the pmix environment */
if (OPAL_SUCCESS != (rc = opal_pmix.server_setup_fork(&child->name, &env))) {
if (OPAL_SUCCESS != (rc = opal_pmix.server_setup_fork(&child->name, &cd->env))) {
ORTE_ERROR_LOG(rc);
state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
goto errorout;
@ -680,16 +682,16 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
child->name.vpid == nm->name.vpid) {
/* we want this one - modify the app's command to include
* the orte xterm cmd that starts with the xtermcmd */
argv = opal_argv_copy(orte_odls_globals.xtermcmd);
cd->argv = opal_argv_copy(orte_odls_globals.xtermcmd);
/* insert the rank into the correct place as a window title */
free(argv[2]);
asprintf(&argv[2], "Rank %s", ORTE_VPID_PRINT(child->name.vpid));
free(cd->argv[2]);
asprintf(&cd->argv[2], "Rank %s", ORTE_VPID_PRINT(child->name.vpid));
/* add in the argv from the app */
for (i=0; NULL != app->argv[i]; i++) {
opal_argv_append_nosize(&argv, app->argv[i]);
opal_argv_append_nosize(&cd->argv, app->argv[i]);
}
/* use the xterm cmd as the app string */
cmd = strdup(orte_odls_globals.xtermcmd[0]);
cd->cmd = strdup(orte_odls_globals.xtermcmd[0]);
found = true;
break;
} else if (jobdat->num_procs <= nm->name.vpid) { /* check for bozo case */
@ -703,21 +705,21 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
}
}
if (!found) {
cmd = strdup(app->app);
argv = opal_argv_copy(app->argv);
cd->cmd = strdup(app->app);
cd->argv = opal_argv_copy(app->argv);
}
} else if (NULL != orte_fork_agent) {
/* we were given a fork agent - use it */
argv = opal_argv_copy(orte_fork_agent);
cd->argv = opal_argv_copy(orte_fork_agent);
/* add in the argv from the app */
for (i=0; NULL != app->argv[i]; i++) {
opal_argv_append_nosize(&argv, app->argv[i]);
opal_argv_append_nosize(&cd->argv, app->argv[i]);
}
/* the app exe name itself is in the argvsav array, so
* we can recover it from there later
*/
cmd = opal_path_findv(orte_fork_agent[0], X_OK, orte_launch_environ, NULL);
if (NULL == cmd) {
cd->cmd = opal_path_findv(orte_fork_agent[0], X_OK, orte_launch_environ, NULL);
if (NULL == cd->cmd) {
orte_show_help("help-orte-odls-base.txt",
"orte-odls-base:fork-agent-not-found",
true, orte_process_info.nodename, orte_fork_agent[0]);
@ -725,14 +727,14 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
goto errorout;
}
} else {
cmd = strdup(app->app);
argv = opal_argv_copy(app->argv);
cd->cmd = strdup(app->app);
cd->argv = opal_argv_copy(app->argv);
}
/* setup the rest of the environment with the proc-specific items - these
* will be overwritten for each child
*/
if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app, &env))) {
if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app, &cd->env))) {
ORTE_ERROR_LOG(rc);
state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
goto errorout;
@ -741,9 +743,9 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
/* if we are indexing the argv by rank, do so now */
if (cd->index_argv) {
char *param;
asprintf(&param, "%s-%d", argv[0], (int)child->name.vpid);
free(argv[0]);
argv[0] = param;
asprintf(&param, "%s-%d", cd->argv[0], (int)child->name.vpid);
free(cd->argv[0]);
cd->argv[0] = param;
}
if (5 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
@ -757,37 +759,19 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
}
}
if (ORTE_SUCCESS != (rc = cd->fork_local(child, cmd, argv, env, jobdat, cd->opts))) {
if (ORTE_SUCCESS != (rc = cd->fork_local(cd))) {
/* error message already output */
state = ORTE_PROC_STATE_FAILED_TO_START;
goto errorout;
}
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_RUNNING);
if (NULL != env) {
opal_argv_free(env);
}
if (NULL != argv) {
opal_argv_free(argv);
}
if (NULL != cmd) {
free(cmd);
}
OBJ_RELEASE(cd);
return;
errorout:
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
ORTE_ACTIVATE_PROC_STATE(&child->name, state);
if (NULL != env) {
opal_argv_free(env);
}
if (NULL != argv) {
opal_argv_free(argv);
}
if (NULL != cmd) {
free(cmd);
}
OBJ_RELEASE(cd);
}
@ -807,6 +791,7 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
char *msg;
orte_odls_spawn_caddy_t *cd;
opal_event_base_t *evb;
char *effective_dir = NULL;
opal_output_verbose(5, orte_odls_base_framework.framework_output,
"%s local:launch",
@ -945,7 +930,7 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
/* setup the working directory for this app - will jump us
* to that directory
*/
if (ORTE_SUCCESS != (rc = setup_path(app))) {
if (ORTE_SUCCESS != (rc = setup_path(app, &effective_dir))) {
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
"%s odls:launch:setup_path failed with error %s(%d)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -1009,6 +994,15 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
goto GETOUT;
}
/* reset our working directory back to our default location - if we
* don't do this, then we will be looking for relative paths starting
* from the last wdir option specified by the user. Thus, we would
* be requiring that the user keep track on the cmd line of where
* each app was located relative to the prior app, instead of relative
* to their current location
*/
chdir(basedir);
/* okay, now let's launch all the local procs for this app using the provided fork_local fn */
for (idx=0; idx < orte_local_children->size; idx++) {
if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
@ -1066,6 +1060,9 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
/* dispatch this child to the next available launch thread */
cd = OBJ_NEW(orte_odls_spawn_caddy_t);
if (NULL != effective_dir) {
cd->wdir = strdup(effective_dir);
}
cd->jdata = jobdat;
cd->app = app;
cd->child = child;
@ -1114,14 +1111,9 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);
}
/* reset our working directory back to our default location - if we
* don't do this, then we will be looking for relative paths starting
* from the last wdir option specified by the user. Thus, we would
* be requiring that the user keep track on the cmd line of where
* each app was located relative to the prior app, instead of relative
* to their current location
*/
chdir(basedir);
if (NULL != effective_dir) {
free(effective_dir);
}
}
GETOUT:
@ -1682,7 +1674,9 @@ int orte_odls_base_default_restart_proc(orte_proc_t *child,
orte_app_context_t *app;
orte_job_t *jobdat;
char basedir[MAXPATHLEN];
orte_iof_base_io_conf_t opts;
char *wdir = NULL;
orte_odls_spawn_caddy_t *cd;
opal_event_base_t *evb;
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
"%s odls:restart_proc for proc %s",
@ -1720,29 +1714,65 @@ int orte_odls_base_default_restart_proc(orte_proc_t *child,
}
/* setup the path */
if (ORTE_SUCCESS != (rc = setup_path(app))) {
if (ORTE_SUCCESS != (rc = setup_path(app, &wdir))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
/* dispatch this child to the next available launch thread */
cd = OBJ_NEW(orte_odls_spawn_caddy_t);
if (NULL != wdir) {
cd->wdir = strdup(wdir);
free(wdir);
}
cd->jdata = jobdat;
cd->app = app;
cd->child = child;
cd->fork_local = fork_local;
/* setup any IOF */
memset(&opts, 0, sizeof(orte_iof_base_io_conf_t));
cd->opts.usepty = OPAL_ENABLE_PTY_SUPPORT;
/* do we want to setup stdin? */
if (jobdat->stdin_target == ORTE_VPID_WILDCARD ||
child->name.vpid == jobdat->stdin_target) {
cd->opts.connect_stdin = true;
} else {
cd->opts.connect_stdin = false;
}
if (ORTE_SUCCESS != (rc = orte_iof_base_setup_prefork(&cd->opts))) {
ORTE_ERROR_LOG(rc);
child->exit_code = rc;
OBJ_RELEASE(cd);
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
goto CLEANUP;
}
if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
/* connect endpoints IOF */
rc = orte_iof_base_setup_parent(&child->name, &opts);
rc = orte_iof_base_setup_parent(&child->name, &cd->opts);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_START);
OBJ_RELEASE(cd);
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
goto CLEANUP;
}
}
orte_wait_cb(child, odls_base_default_wait_local_proc, NULL);
++orte_odls_globals.next_base;
if (orte_odls_globals.num_threads <= orte_odls_globals.next_base) {
orte_odls_globals.next_base = 0;
}
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
"%s restarting app %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->app));
orte_wait_cb(child, odls_base_default_wait_local_proc, NULL);
if (ORTE_SUCCESS != (rc = fork_local(child, app->app, app->argv, app->env, jobdat, opts))) {
evb = orte_odls_globals.ev_bases[orte_odls_globals.next_base];
opal_event_set(evb, &cd->ev, -1,
OPAL_EV_WRITE, orte_odls_base_spawn_proc, cd);
opal_event_set_priority(&cd->ev, ORTE_MSG_PRI);
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);
if (ORTE_SUCCESS != (rc = fork_local(cd))) {
orte_wait_cb_cancel(child);
child->exit_code = ORTE_ERR_SILENT; /* error message already output */
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_START);

Просмотреть файл

@ -239,7 +239,26 @@ OBJ_CLASS_INSTANCE(orte_odls_launch_local_t,
/*
 * Initialization callback registered for orte_odls_spawn_caddy_t
 * (passed alongside scdes to OBJ_CLASS_INSTANCE).
 *
 * Puts a new caddy into a known-empty state: the per-launch strings
 * and string arrays are NULL and the IOF configuration is all zeroes,
 * so scdes can later tell which members were actually filled in.
 */
static void sccon(orte_odls_spawn_caddy_t *p)
{
    /* nothing has been assigned for this launch yet */
    p->env = NULL;
    p->argv = NULL;
    p->wdir = NULL;
    p->cmd = NULL;

    /* start from a zeroed IOF configuration */
    memset(&p->opts, 0, sizeof(p->opts));
}
/*
 * Cleanup callback registered for orte_odls_spawn_caddy_t
 * (passed alongside sccon to OBJ_CLASS_INSTANCE).
 *
 * Releases the per-launch storage held by the caddy: the command
 * string, the working directory string, and the argv/env arrays.
 * The jdata, app and child pointers are not touched here.
 */
static void scdes(orte_odls_spawn_caddy_t *p)
{
    /* free(NULL) is a no-op, so the plain strings need no guard */
    free(p->cmd);
    free(p->wdir);

    /* only hand the arrays to opal_argv_free when they were set */
    if (NULL != p->argv) {
        opal_argv_free(p->argv);
    }
    if (NULL != p->env) {
        opal_argv_free(p->env);
    }
}
OBJ_CLASS_INSTANCE(orte_odls_spawn_caddy_t,
opal_object_t,
sccon, NULL);
sccon, scdes);

Просмотреть файл

@ -82,16 +82,16 @@ orte_odls_base_default_construct_child_list(opal_buffer_t *data,
ORTE_DECLSPEC void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata);
/* define a function that will fork a local proc */
typedef int (*orte_odls_base_fork_local_proc_fn_t)(orte_proc_t *child,
char *app, char **argv,
char **environ_copy,
orte_job_t *jdata,
orte_iof_base_io_conf_t opts);
typedef int (*orte_odls_base_fork_local_proc_fn_t)(void *cd);
/* define an object for fork/exec the local proc */
typedef struct {
opal_object_t super;
opal_event_t ev;
char *cmd;
char *wdir;
char **argv;
char **env;
orte_job_t *jdata;
orte_app_context_t *app;
orte_proc_t *child;

Просмотреть файл

@ -145,11 +145,7 @@ static void send_error_show_help(int fd, int exit_status,
const char *file, const char *topic, ...)
__opal_attribute_noreturn__;
static int do_child(orte_proc_t *child,
char *cmd, char **argv,
char **environ_copy,
orte_job_t *jobdat, int write_fd,
orte_iof_base_io_conf_t opts)
static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
__opal_attribute_noreturn__;
@ -319,11 +315,7 @@ static int close_open_file_descriptors(int write_fd,
return ORTE_SUCCESS;
}
static int do_child(orte_proc_t *child,
char *app, char **argv,
char **environ_copy,
orte_job_t *jobdat, int write_fd,
orte_iof_base_io_conf_t opts)
static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
{
int i;
sigset_t sigs;
@ -339,7 +331,7 @@ static int do_child(orte_proc_t *child,
/* Setup the pipe to be close-on-exec */
opal_fd_set_cloexec(write_fd);
if (NULL != child) {
if (NULL != cd->child) {
/* setup stdout/stderr so that any error messages that we
may print out will get displayed back at orterun.
@ -353,22 +345,21 @@ static int do_child(orte_proc_t *child,
always outputs a nice, single message indicating what
happened
*/
if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&opts,
&environ_copy))) {
if (ORTE_FLAG_TEST(cd->jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&cd->opts, &cd->env))) {
ORTE_ERROR_LOG(i);
send_error_show_help(write_fd, 1,
"help-orte-odls-default.txt",
"iof setup failed",
orte_process_info.nodename, app);
orte_process_info.nodename, cd->app->app);
/* Does not return */
}
}
/* now set any child-level controls such as binding */
orte_rtc.set(jobdat, child, &environ_copy, write_fd);
orte_rtc.set(cd->jdata, cd->child, &cd->env, write_fd);
} else if (!ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
} else if (!ORTE_FLAG_TEST(cd->jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
/* tie stdin/out/err/internal to /dev/null */
int fdnull;
for (i=0; i < 3; i++) {
@ -379,8 +370,8 @@ static int do_child(orte_proc_t *child,
close(fdnull);
}
fdnull = open("/dev/null", O_RDONLY, 0);
if (fdnull > opts.p_internal[1]) {
dup2(fdnull, opts.p_internal[1]);
if (fdnull > cd->opts.p_internal[1]) {
dup2(fdnull, cd->opts.p_internal[1]);
}
close(fdnull);
}
@ -388,19 +379,19 @@ static int do_child(orte_proc_t *child,
/* close all open file descriptors w/ exception of stdin/stdout/stderr,
the pipe used for the IOF INTERNAL messages, and the pipe up to
the parent. */
if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, opts)) {
if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, cd->opts)) {
// close *all* file descriptors -- slow
for(fd=3; fd<fdmax; fd++) {
if (fd != opts.p_internal[1] && fd != write_fd) {
if (fd != cd->opts.p_internal[1] && fd != write_fd) {
close(fd);
}
}
}
if (argv == NULL) {
argv = malloc(sizeof(char*)*2);
argv[0] = strdup(app);
argv[1] = NULL;
if (cd->argv == NULL) {
cd->argv = malloc(sizeof(char*)*2);
cd->argv[0] = strdup(cd->app->app);
cd->argv[1] = NULL;
}
/* Set signal handlers back to the default. Do this close to
@ -423,31 +414,31 @@ static int do_child(orte_proc_t *child,
sigprocmask(0, 0, &sigs);
sigprocmask(SIG_UNBLOCK, &sigs, 0);
/* Exec the new executable */
/* take us to the correct wdir */
if (NULL != cd->wdir) {
chdir(cd->wdir);
}
execve(app, argv, environ_copy);
/* Exec the new executable */
execve(cd->app->app, cd->argv, cd->env);
getcwd(dir, sizeof(dir));
send_error_show_help(write_fd, 1,
"help-orte-odls-default.txt", "execve error",
orte_process_info.nodename, dir, app, strerror(errno));
orte_process_info.nodename, dir, cd->app->app, strerror(errno));
/* Does not return */
}
static int do_parent(orte_proc_t *child,
char *app, char **argv,
char **environ_copy,
orte_job_t *jobdat, int read_fd,
orte_iof_base_io_conf_t opts)
static int do_parent(orte_odls_spawn_caddy_t *cd, int read_fd)
{
int rc;
orte_odls_pipe_err_msg_t msg;
char file[ORTE_ODLS_MAX_FILE_LEN + 1], topic[ORTE_ODLS_MAX_TOPIC_LEN + 1], *str = NULL;
close(opts.p_stdin[0]);
close(opts.p_stdout[1]);
close(opts.p_stderr[1]);
close(opts.p_internal[1]);
close(cd->opts.p_stdin[0]);
close(cd->opts.p_stdout[1]);
close(cd->opts.p_stderr[1]);
close(cd->opts.p_internal[1]);
/* Block reading a message from the pipe */
while (1) {
@ -463,18 +454,18 @@ static int do_parent(orte_proc_t *child,
ORTE_ERROR_LOG(rc);
close(read_fd);
if (NULL != child) {
child->state = ORTE_PROC_STATE_UNDEF;
if (NULL != cd->child) {
cd->child->state = ORTE_PROC_STATE_UNDEF;
}
return rc;
}
/* Otherwise, we got a warning or error message from the child */
if (NULL != child) {
if (NULL != cd->child) {
if (msg.fatal) {
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
} else {
ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
}
}
@ -484,10 +475,10 @@ static int do_parent(orte_proc_t *child,
if (OPAL_SUCCESS != rc) {
orte_show_help("help-orte-odls-default.txt", "syscall fail",
true,
orte_process_info.nodename, app,
orte_process_info.nodename, cd->app->app,
"opal_fd_read", __FILE__, __LINE__);
if (NULL != child) {
child->state = ORTE_PROC_STATE_UNDEF;
if (NULL != cd->child) {
cd->child->state = ORTE_PROC_STATE_UNDEF;
}
return rc;
}
@ -498,10 +489,10 @@ static int do_parent(orte_proc_t *child,
if (OPAL_SUCCESS != rc) {
orte_show_help("help-orte-odls-default.txt", "syscall fail",
true,
orte_process_info.nodename, app,
orte_process_info.nodename, cd->app->app,
"opal_fd_read", __FILE__, __LINE__);
if (NULL != child) {
child->state = ORTE_PROC_STATE_UNDEF;
if (NULL != cd->child) {
cd->child->state = ORTE_PROC_STATE_UNDEF;
}
return rc;
}
@ -512,10 +503,10 @@ static int do_parent(orte_proc_t *child,
if (NULL == str) {
orte_show_help("help-orte-odls-default.txt", "syscall fail",
true,
orte_process_info.nodename, app,
orte_process_info.nodename, cd->app->app,
"opal_fd_read", __FILE__, __LINE__);
if (NULL != child) {
child->state = ORTE_PROC_STATE_UNDEF;
if (NULL != cd->child) {
cd->child->state = ORTE_PROC_STATE_UNDEF;
}
return rc;
}
@ -536,9 +527,9 @@ static int do_parent(orte_proc_t *child,
closed, indicating that the child launched
successfully). */
if (msg.fatal) {
if (NULL != child) {
child->state = ORTE_PROC_STATE_FAILED_TO_START;
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
if (NULL != cd->child) {
cd->child->state = ORTE_PROC_STATE_FAILED_TO_START;
ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
}
close(read_fd);
return ORTE_ERR_FAILED_TO_START;
@ -548,9 +539,9 @@ static int do_parent(orte_proc_t *child,
/* If we got here, it means that the pipe closed without
indication of a fatal error, meaning that the child process
launched successfully. */
if (NULL != child) {
child->state = ORTE_PROC_STATE_RUNNING;
ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
if (NULL != cd->child) {
cd->child->state = ORTE_PROC_STATE_RUNNING;
ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
}
close(read_fd);
@ -561,15 +552,12 @@ static int do_parent(orte_proc_t *child,
/**
* Fork/exec the specified processes
*/
static int odls_default_fork_local_proc(orte_proc_t *child,
char *app,
char **argv,
char **environ_copy,
orte_job_t *jobdat,
orte_iof_base_io_conf_t opts)
static int odls_default_fork_local_proc(void *cdptr)
{
orte_odls_spawn_caddy_t *cd = (orte_odls_spawn_caddy_t*)cdptr;
int p[2];
pid_t pid;
orte_proc_t *child = cd->child;
/* A pipe is used to communicate between the parent and child to
indicate whether the exec ultimately succeeded or failed. The
@ -605,12 +593,12 @@ static int odls_default_fork_local_proc(orte_proc_t *child,
if (pid == 0) {
close(p[0]);
do_child(child, app, argv, environ_copy, jobdat, p[1], opts);
do_child(cd, p[1]);
/* Does not return */
}
close(p[1]);
return do_parent(child, app, argv, environ_copy, jobdat, p[0], opts);
return do_parent(cd, p[0]);
}