1
1

Resolve a race condition when setting the working directory while fork/exec'ing application procs. Because we want to use multiple threads in the odls, we have to ensure the change of directory is done after the fork occurs, i.e. in the child. Otherwise, the different launch threads each change the working directory of the entire parent process, bouncing it around underneath one another.

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Ralph Castain 2017-03-21 13:54:03 -07:00
родитель 20bf0dd7c6
Коммит 75684dc260
5 изменённых файлов: 186 добавлений и 155 удалений

Просмотреть файл

@ -342,11 +342,7 @@ static int close_open_file_descriptors(int write_fd, orte_iof_base_io_conf_t opt
return ORTE_SUCCESS;
}
static int do_child( orte_proc_t *child,
char *app, char **argv,
char **environ_copy,
orte_job_t *jobdat, int write_fd,
orte_iof_base_io_conf_t opts)
static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
{
int i, rc;
sigset_t sigs;
@ -355,7 +351,7 @@ static int do_child( orte_proc_t *child,
/* Setup the pipe to be close-on-exec */
opal_fd_set_cloexec(write_fd);
if (NULL != child) {
if (NULL != cd->child) {
/* setup stdout/stderr so that any error messages that we
may print out will get displayed back at orterun.
@ -369,20 +365,19 @@ static int do_child( orte_proc_t *child,
always outputs a nice, single message indicating what
happened
*/
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&opts,
&environ_copy))) {
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&cd->opts, &cd->env))) {
ORTE_ERROR_LOG(i);
send_error_show_help(write_fd, 1,
"help-orte-odls-alps.txt",
"iof setup failed",
orte_process_info.nodename, app);
orte_process_info.nodename, cd->app->app);
/* Does not return */
}
/* now set any child-level controls such as binding */
orte_rtc.set(jobdat, child, &environ_copy, write_fd);
orte_rtc.set(cd->jdata, cd->child, &cd->env, write_fd);
} else if (!ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
} else if (!ORTE_FLAG_TEST(cd->jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
/* tie stdin/out/err/internal to /dev/null */
int fdnull;
for (i=0; i < 3; i++) {
@ -393,24 +388,24 @@ static int do_child( orte_proc_t *child,
close(fdnull);
}
fdnull = open("/dev/null", O_RDONLY, 0);
if (fdnull > opts.p_internal[1]) {
dup2(fdnull, opts.p_internal[1]);
if (fdnull > cd->opts.p_internal[1]) {
dup2(fdnull, cd->opts.p_internal[1]);
}
close(fdnull);
}
if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, opts)) {
if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, cd->opts)) {
send_error_show_help(write_fd, 1, "help-orte-odls-alps.txt",
"close fds",
orte_process_info.nodename, app,
orte_process_info.nodename, cd->app->app,
__FILE__, __LINE__);
}
if (argv == NULL) {
argv = malloc(sizeof(char*)*2);
argv[0] = strdup(app);
argv[1] = NULL;
if (cd->argv == NULL) {
cd->argv = malloc(sizeof(char*)*2);
cd->argv[0] = strdup(cd->app->app);
cd->argv[1] = NULL;
}
/* Set signal handlers back to the default. Do this close to
@ -437,19 +432,19 @@ static int do_child( orte_proc_t *child,
if (10 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
int jout;
opal_output(0, "%s STARTING %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app);
for (jout=0; NULL != argv[jout]; jout++) {
opal_output(0, "%s\tARGV[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, argv[jout]);
opal_output(0, "%s STARTING %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cd->app->app);
for (jout=0; NULL != cd->argv[jout]; jout++) {
opal_output(0, "%s\tARGV[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, cd->argv[jout]);
}
for (jout=0; NULL != environ_copy[jout]; jout++) {
opal_output(0, "%s\tENVIRON[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, environ_copy[jout]);
for (jout=0; NULL != cd->env[jout]; jout++) {
opal_output(0, "%s\tENVIRON[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, cd->env[jout]);
}
}
execve(app, argv, environ_copy);
execve(cd->app->app, cd->argv, cd->env);
send_error_show_help(write_fd, 1,
"help-orte-odls-alps.txt", "execve error",
orte_process_info.nodename, app, strerror(errno));
orte_process_info.nodename, cd->app->app, strerror(errno));
/* Does not return */
}
@ -729,4 +724,3 @@ static int orte_odls_alps_restart_proc(orte_proc_t *child)
}
return rc;
}

Просмотреть файл

@ -507,7 +507,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
return rc;
}
static int setup_path(orte_app_context_t *app)
static int setup_path(orte_app_context_t *app, char **wdir)
{
int rc;
char dir[MAXPATHLEN];
@ -539,9 +539,12 @@ static int setup_path(orte_app_context_t *app)
* ensuring they start out matching.
*/
getcwd(dir, sizeof(dir));
*wdir = strdup(dir);
opal_setenv("PWD", dir, true, &app->env);
/* update the initial wdir value too */
opal_setenv(OPAL_MCA_PREFIX"initial_wdir", dir, true, &app->env);
} else {
*wdir = NULL;
}
/* Search for the OMPI_exec_path and PATH settings in the environment. */
@ -631,13 +634,12 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
orte_job_t *jobdat = cd->jdata;
orte_app_context_t *app = cd->app;
orte_proc_t *child = cd->child;
char **env = NULL, **argv = NULL, *cmd = NULL;
int rc, i;
bool found;
orte_proc_state_t state;
/* thread-protect common values */
env = opal_argv_copy(app->env);
cd->env = opal_argv_copy(app->env);
/* ensure we clear any prior info regarding state or exit status in
* case this is a restart
@ -646,7 +648,7 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_WAITPID);
/* setup the pmix environment */
if (OPAL_SUCCESS != (rc = opal_pmix.server_setup_fork(&child->name, &env))) {
if (OPAL_SUCCESS != (rc = opal_pmix.server_setup_fork(&child->name, &cd->env))) {
ORTE_ERROR_LOG(rc);
state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
goto errorout;
@ -680,16 +682,16 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
child->name.vpid == nm->name.vpid) {
/* we want this one - modify the app's command to include
* the orte xterm cmd that starts with the xtermcmd */
argv = opal_argv_copy(orte_odls_globals.xtermcmd);
cd->argv = opal_argv_copy(orte_odls_globals.xtermcmd);
/* insert the rank into the correct place as a window title */
free(argv[2]);
asprintf(&argv[2], "Rank %s", ORTE_VPID_PRINT(child->name.vpid));
free(cd->argv[2]);
asprintf(&cd->argv[2], "Rank %s", ORTE_VPID_PRINT(child->name.vpid));
/* add in the argv from the app */
for (i=0; NULL != app->argv[i]; i++) {
opal_argv_append_nosize(&argv, app->argv[i]);
opal_argv_append_nosize(&cd->argv, app->argv[i]);
}
/* use the xterm cmd as the app string */
cmd = strdup(orte_odls_globals.xtermcmd[0]);
cd->cmd = strdup(orte_odls_globals.xtermcmd[0]);
found = true;
break;
} else if (jobdat->num_procs <= nm->name.vpid) { /* check for bozo case */
@ -703,21 +705,21 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
}
}
if (!found) {
cmd = strdup(app->app);
argv = opal_argv_copy(app->argv);
cd->cmd = strdup(app->app);
cd->argv = opal_argv_copy(app->argv);
}
} else if (NULL != orte_fork_agent) {
/* we were given a fork agent - use it */
argv = opal_argv_copy(orte_fork_agent);
cd->argv = opal_argv_copy(orte_fork_agent);
/* add in the argv from the app */
for (i=0; NULL != app->argv[i]; i++) {
opal_argv_append_nosize(&argv, app->argv[i]);
opal_argv_append_nosize(&cd->argv, app->argv[i]);
}
/* the app exe name itself is in the argvsav array, so
* we can recover it from there later
*/
cmd = opal_path_findv(orte_fork_agent[0], X_OK, orte_launch_environ, NULL);
if (NULL == cmd) {
cd->cmd = opal_path_findv(orte_fork_agent[0], X_OK, orte_launch_environ, NULL);
if (NULL == cd->cmd) {
orte_show_help("help-orte-odls-base.txt",
"orte-odls-base:fork-agent-not-found",
true, orte_process_info.nodename, orte_fork_agent[0]);
@ -725,14 +727,14 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
goto errorout;
}
} else {
cmd = strdup(app->app);
argv = opal_argv_copy(app->argv);
cd->cmd = strdup(app->app);
cd->argv = opal_argv_copy(app->argv);
}
/* setup the rest of the environment with the proc-specific items - these
* will be overwritten for each child
*/
if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app, &env))) {
if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app, &cd->env))) {
ORTE_ERROR_LOG(rc);
state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
goto errorout;
@ -741,9 +743,9 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
/* if we are indexing the argv by rank, do so now */
if (cd->index_argv) {
char *param;
asprintf(&param, "%s-%d", argv[0], (int)child->name.vpid);
free(argv[0]);
argv[0] = param;
asprintf(&param, "%s-%d", cd->argv[0], (int)child->name.vpid);
free(cd->argv[0]);
cd->argv[0] = param;
}
if (5 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
@ -757,37 +759,19 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
}
}
if (ORTE_SUCCESS != (rc = cd->fork_local(child, cmd, argv, env, jobdat, cd->opts))) {
if (ORTE_SUCCESS != (rc = cd->fork_local(cd))) {
/* error message already output */
state = ORTE_PROC_STATE_FAILED_TO_START;
goto errorout;
}
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_RUNNING);
if (NULL != env) {
opal_argv_free(env);
}
if (NULL != argv) {
opal_argv_free(argv);
}
if (NULL != cmd) {
free(cmd);
}
OBJ_RELEASE(cd);
return;
errorout:
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
ORTE_ACTIVATE_PROC_STATE(&child->name, state);
if (NULL != env) {
opal_argv_free(env);
}
if (NULL != argv) {
opal_argv_free(argv);
}
if (NULL != cmd) {
free(cmd);
}
OBJ_RELEASE(cd);
}
@ -807,6 +791,7 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
char *msg;
orte_odls_spawn_caddy_t *cd;
opal_event_base_t *evb;
char *effective_dir = NULL;
opal_output_verbose(5, orte_odls_base_framework.framework_output,
"%s local:launch",
@ -945,7 +930,7 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
/* setup the working directory for this app - will jump us
* to that directory
*/
if (ORTE_SUCCESS != (rc = setup_path(app))) {
if (ORTE_SUCCESS != (rc = setup_path(app, &effective_dir))) {
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
"%s odls:launch:setup_path failed with error %s(%d)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -1009,6 +994,15 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
goto GETOUT;
}
/* reset our working directory back to our default location - if we
* don't do this, then we will be looking for relative paths starting
* from the last wdir option specified by the user. Thus, we would
* be requiring that the user keep track on the cmd line of where
* each app was located relative to the prior app, instead of relative
* to their current location
*/
chdir(basedir);
/* okay, now let's launch all the local procs for this app using the provided fork_local fn */
for (idx=0; idx < orte_local_children->size; idx++) {
if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
@ -1066,6 +1060,9 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
/* dispatch this child to the next available launch thread */
cd = OBJ_NEW(orte_odls_spawn_caddy_t);
if (NULL != effective_dir) {
cd->wdir = strdup(effective_dir);
}
cd->jdata = jobdat;
cd->app = app;
cd->child = child;
@ -1114,14 +1111,9 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);
}
/* reset our working directory back to our default location - if we
* don't do this, then we will be looking for relative paths starting
* from the last wdir option specified by the user. Thus, we would
* be requiring that the user keep track on the cmd line of where
* each app was located relative to the prior app, instead of relative
* to their current location
*/
chdir(basedir);
if (NULL != effective_dir) {
free(effective_dir);
}
}
GETOUT:
@ -1682,7 +1674,9 @@ int orte_odls_base_default_restart_proc(orte_proc_t *child,
orte_app_context_t *app;
orte_job_t *jobdat;
char basedir[MAXPATHLEN];
orte_iof_base_io_conf_t opts;
char *wdir = NULL;
orte_odls_spawn_caddy_t *cd;
opal_event_base_t *evb;
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
"%s odls:restart_proc for proc %s",
@ -1720,35 +1714,71 @@ int orte_odls_base_default_restart_proc(orte_proc_t *child,
}
/* setup the path */
if (ORTE_SUCCESS != (rc = setup_path(app))) {
if (ORTE_SUCCESS != (rc = setup_path(app, &wdir))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
/* dispatch this child to the next available launch thread */
cd = OBJ_NEW(orte_odls_spawn_caddy_t);
if (NULL != wdir) {
cd->wdir = strdup(wdir);
free(wdir);
}
cd->jdata = jobdat;
cd->app = app;
cd->child = child;
cd->fork_local = fork_local;
/* setup any IOF */
memset(&opts, 0, sizeof(orte_iof_base_io_conf_t));
cd->opts.usepty = OPAL_ENABLE_PTY_SUPPORT;
/* do we want to setup stdin? */
if (jobdat->stdin_target == ORTE_VPID_WILDCARD ||
child->name.vpid == jobdat->stdin_target) {
cd->opts.connect_stdin = true;
} else {
cd->opts.connect_stdin = false;
}
if (ORTE_SUCCESS != (rc = orte_iof_base_setup_prefork(&cd->opts))) {
ORTE_ERROR_LOG(rc);
child->exit_code = rc;
OBJ_RELEASE(cd);
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
goto CLEANUP;
}
if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
/* connect endpoints IOF */
rc = orte_iof_base_setup_parent(&child->name, &opts);
rc = orte_iof_base_setup_parent(&child->name, &cd->opts);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_START);
OBJ_RELEASE(cd);
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
goto CLEANUP;
}
}
orte_wait_cb(child, odls_base_default_wait_local_proc, NULL);
++orte_odls_globals.next_base;
if (orte_odls_globals.num_threads <= orte_odls_globals.next_base) {
orte_odls_globals.next_base = 0;
}
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
"%s restarting app %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->app));
orte_wait_cb(child, odls_base_default_wait_local_proc, NULL);
if (ORTE_SUCCESS != (rc = fork_local(child, app->app, app->argv, app->env, jobdat, opts))) {
evb = orte_odls_globals.ev_bases[orte_odls_globals.next_base];
opal_event_set(evb, &cd->ev, -1,
OPAL_EV_WRITE, orte_odls_base_spawn_proc, cd);
opal_event_set_priority(&cd->ev, ORTE_MSG_PRI);
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);
if (ORTE_SUCCESS != (rc = fork_local(cd))) {
orte_wait_cb_cancel(child);
child->exit_code = ORTE_ERR_SILENT; /* error message already output */
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_START);
}
CLEANUP:
CLEANUP:
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
"%s odls:restart of proc %s %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),

Просмотреть файл

@ -239,7 +239,26 @@ OBJ_CLASS_INSTANCE(orte_odls_launch_local_t,
/* Constructor for the spawn caddy: zero the IOF options and start with
 * no command, working directory, argv, or env attached. */
static void sccon(orte_odls_spawn_caddy_t *p)
{
    /* nothing is attached to a freshly constructed caddy */
    p->env = NULL;
    p->argv = NULL;
    p->wdir = NULL;
    p->cmd = NULL;

    /* clear the entire IOF configuration in one shot */
    memset(&p->opts, 0, sizeof(p->opts));
}
/* Destructor for the spawn caddy: release the command string, the
 * working directory string, and the argv/env vectors stored on it. */
static void scdes(orte_odls_spawn_caddy_t *p)
{
    /* free() is a no-op on NULL, so the plain strings need no guard */
    free(p->wdir);
    free(p->cmd);

    /* only hand the vectors to opal_argv_free when they were set */
    if (NULL != p->env) {
        opal_argv_free(p->env);
    }
    if (NULL != p->argv) {
        opal_argv_free(p->argv);
    }
}
OBJ_CLASS_INSTANCE(orte_odls_spawn_caddy_t,
opal_object_t,
sccon, NULL);
sccon, scdes);

Просмотреть файл

@ -82,16 +82,16 @@ orte_odls_base_default_construct_child_list(opal_buffer_t *data,
ORTE_DECLSPEC void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata);
/* define a function that will fork a local proc */
typedef int (*orte_odls_base_fork_local_proc_fn_t)(orte_proc_t *child,
char *app, char **argv,
char **environ_copy,
orte_job_t *jdata,
orte_iof_base_io_conf_t opts);
typedef int (*orte_odls_base_fork_local_proc_fn_t)(void *cd);
/* define an object for fork/exec the local proc */
typedef struct {
opal_object_t super;
opal_event_t ev;
char *cmd;
char *wdir;
char **argv;
char **env;
orte_job_t *jdata;
orte_app_context_t *app;
orte_proc_t *child;

Просмотреть файл

@ -145,11 +145,7 @@ static void send_error_show_help(int fd, int exit_status,
const char *file, const char *topic, ...)
__opal_attribute_noreturn__;
static int do_child(orte_proc_t *child,
char *cmd, char **argv,
char **environ_copy,
orte_job_t *jobdat, int write_fd,
orte_iof_base_io_conf_t opts)
static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
__opal_attribute_noreturn__;
@ -319,11 +315,7 @@ static int close_open_file_descriptors(int write_fd,
return ORTE_SUCCESS;
}
static int do_child(orte_proc_t *child,
char *app, char **argv,
char **environ_copy,
orte_job_t *jobdat, int write_fd,
orte_iof_base_io_conf_t opts)
static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
{
int i;
sigset_t sigs;
@ -339,7 +331,7 @@ static int do_child(orte_proc_t *child,
/* Setup the pipe to be close-on-exec */
opal_fd_set_cloexec(write_fd);
if (NULL != child) {
if (NULL != cd->child) {
/* setup stdout/stderr so that any error messages that we
may print out will get displayed back at orterun.
@ -353,22 +345,21 @@ static int do_child(orte_proc_t *child,
always outputs a nice, single message indicating what
happened
*/
if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&opts,
&environ_copy))) {
if (ORTE_FLAG_TEST(cd->jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&cd->opts, &cd->env))) {
ORTE_ERROR_LOG(i);
send_error_show_help(write_fd, 1,
"help-orte-odls-default.txt",
"iof setup failed",
orte_process_info.nodename, app);
orte_process_info.nodename, cd->app->app);
/* Does not return */
}
}
/* now set any child-level controls such as binding */
orte_rtc.set(jobdat, child, &environ_copy, write_fd);
orte_rtc.set(cd->jdata, cd->child, &cd->env, write_fd);
} else if (!ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
} else if (!ORTE_FLAG_TEST(cd->jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
/* tie stdin/out/err/internal to /dev/null */
int fdnull;
for (i=0; i < 3; i++) {
@ -379,8 +370,8 @@ static int do_child(orte_proc_t *child,
close(fdnull);
}
fdnull = open("/dev/null", O_RDONLY, 0);
if (fdnull > opts.p_internal[1]) {
dup2(fdnull, opts.p_internal[1]);
if (fdnull > cd->opts.p_internal[1]) {
dup2(fdnull, cd->opts.p_internal[1]);
}
close(fdnull);
}
@ -388,19 +379,19 @@ static int do_child(orte_proc_t *child,
/* close all open file descriptors w/ exception of stdin/stdout/stderr,
the pipe used for the IOF INTERNAL messages, and the pipe up to
the parent. */
if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, opts)) {
if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, cd->opts)) {
// close *all* file descriptors -- slow
for(fd=3; fd<fdmax; fd++) {
if (fd != opts.p_internal[1] && fd != write_fd) {
if (fd != cd->opts.p_internal[1] && fd != write_fd) {
close(fd);
}
}
}
if (argv == NULL) {
argv = malloc(sizeof(char*)*2);
argv[0] = strdup(app);
argv[1] = NULL;
if (cd->argv == NULL) {
cd->argv = malloc(sizeof(char*)*2);
cd->argv[0] = strdup(cd->app->app);
cd->argv[1] = NULL;
}
/* Set signal handlers back to the default. Do this close to
@ -423,31 +414,31 @@ static int do_child(orte_proc_t *child,
sigprocmask(0, 0, &sigs);
sigprocmask(SIG_UNBLOCK, &sigs, 0);
/* Exec the new executable */
/* take us to the correct wdir */
if (NULL != cd->wdir) {
chdir(cd->wdir);
}
execve(app, argv, environ_copy);
/* Exec the new executable */
execve(cd->app->app, cd->argv, cd->env);
getcwd(dir, sizeof(dir));
send_error_show_help(write_fd, 1,
"help-orte-odls-default.txt", "execve error",
orte_process_info.nodename, dir, app, strerror(errno));
orte_process_info.nodename, dir, cd->app->app, strerror(errno));
/* Does not return */
}
static int do_parent(orte_proc_t *child,
char *app, char **argv,
char **environ_copy,
orte_job_t *jobdat, int read_fd,
orte_iof_base_io_conf_t opts)
static int do_parent(orte_odls_spawn_caddy_t *cd, int read_fd)
{
int rc;
orte_odls_pipe_err_msg_t msg;
char file[ORTE_ODLS_MAX_FILE_LEN + 1], topic[ORTE_ODLS_MAX_TOPIC_LEN + 1], *str = NULL;
close(opts.p_stdin[0]);
close(opts.p_stdout[1]);
close(opts.p_stderr[1]);
close(opts.p_internal[1]);
close(cd->opts.p_stdin[0]);
close(cd->opts.p_stdout[1]);
close(cd->opts.p_stderr[1]);
close(cd->opts.p_internal[1]);
/* Block reading a message from the pipe */
while (1) {
@ -463,18 +454,18 @@ static int do_parent(orte_proc_t *child,
ORTE_ERROR_LOG(rc);
close(read_fd);
if (NULL != child) {
child->state = ORTE_PROC_STATE_UNDEF;
if (NULL != cd->child) {
cd->child->state = ORTE_PROC_STATE_UNDEF;
}
return rc;
}
/* Otherwise, we got a warning or error message from the child */
if (NULL != child) {
if (NULL != cd->child) {
if (msg.fatal) {
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
} else {
ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
}
}
@ -484,10 +475,10 @@ static int do_parent(orte_proc_t *child,
if (OPAL_SUCCESS != rc) {
orte_show_help("help-orte-odls-default.txt", "syscall fail",
true,
orte_process_info.nodename, app,
orte_process_info.nodename, cd->app->app,
"opal_fd_read", __FILE__, __LINE__);
if (NULL != child) {
child->state = ORTE_PROC_STATE_UNDEF;
if (NULL != cd->child) {
cd->child->state = ORTE_PROC_STATE_UNDEF;
}
return rc;
}
@ -498,10 +489,10 @@ static int do_parent(orte_proc_t *child,
if (OPAL_SUCCESS != rc) {
orte_show_help("help-orte-odls-default.txt", "syscall fail",
true,
orte_process_info.nodename, app,
orte_process_info.nodename, cd->app->app,
"opal_fd_read", __FILE__, __LINE__);
if (NULL != child) {
child->state = ORTE_PROC_STATE_UNDEF;
if (NULL != cd->child) {
cd->child->state = ORTE_PROC_STATE_UNDEF;
}
return rc;
}
@ -512,10 +503,10 @@ static int do_parent(orte_proc_t *child,
if (NULL == str) {
orte_show_help("help-orte-odls-default.txt", "syscall fail",
true,
orte_process_info.nodename, app,
orte_process_info.nodename, cd->app->app,
"opal_fd_read", __FILE__, __LINE__);
if (NULL != child) {
child->state = ORTE_PROC_STATE_UNDEF;
if (NULL != cd->child) {
cd->child->state = ORTE_PROC_STATE_UNDEF;
}
return rc;
}
@ -536,9 +527,9 @@ static int do_parent(orte_proc_t *child,
closed, indicating that the child launched
successfully). */
if (msg.fatal) {
if (NULL != child) {
child->state = ORTE_PROC_STATE_FAILED_TO_START;
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
if (NULL != cd->child) {
cd->child->state = ORTE_PROC_STATE_FAILED_TO_START;
ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
}
close(read_fd);
return ORTE_ERR_FAILED_TO_START;
@ -548,9 +539,9 @@ static int do_parent(orte_proc_t *child,
/* If we got here, it means that the pipe closed without
indication of a fatal error, meaning that the child process
launched successfully. */
if (NULL != child) {
child->state = ORTE_PROC_STATE_RUNNING;
ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
if (NULL != cd->child) {
cd->child->state = ORTE_PROC_STATE_RUNNING;
ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
}
close(read_fd);
@ -561,15 +552,12 @@ static int do_parent(orte_proc_t *child,
/**
* Fork/exec the specified processes
*/
static int odls_default_fork_local_proc(orte_proc_t *child,
char *app,
char **argv,
char **environ_copy,
orte_job_t *jobdat,
orte_iof_base_io_conf_t opts)
static int odls_default_fork_local_proc(void *cdptr)
{
orte_odls_spawn_caddy_t *cd = (orte_odls_spawn_caddy_t*)cdptr;
int p[2];
pid_t pid;
orte_proc_t *child = cd->child;
/* A pipe is used to communicate between the parent and child to
indicate whether the exec ultimately succeeded or failed. The
@ -605,12 +593,12 @@ static int odls_default_fork_local_proc(orte_proc_t *child,
if (pid == 0) {
close(p[0]);
do_child(child, app, argv, environ_copy, jobdat, p[1], opts);
do_child(cd, p[1]);
/* Does not return */
}
close(p[1]);
return do_parent(child, app, argv, environ_copy, jobdat, p[0], opts);
return do_parent(cd, p[0]);
}