Merge pull request #3217 from rhc54/topic/wdirs
Resolve a race condition for setting our working directory when fork/exec'ing application procs.
Этот коммит содержится в:
Коммит
10d401b6ec
@ -144,11 +144,7 @@ static int orte_odls_alps_restart_proc(orte_proc_t *child);
|
|||||||
static void send_error_show_help(int fd, int exit_status,
|
static void send_error_show_help(int fd, int exit_status,
|
||||||
const char *file, const char *topic, ...)
|
const char *file, const char *topic, ...)
|
||||||
__opal_attribute_noreturn__;
|
__opal_attribute_noreturn__;
|
||||||
static int do_child(orte_proc_t *child,
|
static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
|
||||||
char *app, char **argv,
|
|
||||||
char **environ_copy,
|
|
||||||
orte_job_t *jobdat, int write_fd,
|
|
||||||
orte_iof_base_io_conf_t opts)
|
|
||||||
__opal_attribute_noreturn__;
|
__opal_attribute_noreturn__;
|
||||||
|
|
||||||
|
|
||||||
@ -342,20 +338,15 @@ static int close_open_file_descriptors(int write_fd, orte_iof_base_io_conf_t opt
|
|||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int do_child( orte_proc_t *child,
|
static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
|
||||||
char *app, char **argv,
|
|
||||||
char **environ_copy,
|
|
||||||
orte_job_t *jobdat, int write_fd,
|
|
||||||
orte_iof_base_io_conf_t opts)
|
|
||||||
{
|
{
|
||||||
int i, rc;
|
int i;
|
||||||
sigset_t sigs;
|
sigset_t sigs;
|
||||||
char *param, *msg;
|
|
||||||
|
|
||||||
/* Setup the pipe to be close-on-exec */
|
/* Setup the pipe to be close-on-exec */
|
||||||
opal_fd_set_cloexec(write_fd);
|
opal_fd_set_cloexec(write_fd);
|
||||||
|
|
||||||
if (NULL != child) {
|
if (NULL != cd->child) {
|
||||||
/* setup stdout/stderr so that any error messages that we
|
/* setup stdout/stderr so that any error messages that we
|
||||||
may print out will get displayed back at orterun.
|
may print out will get displayed back at orterun.
|
||||||
|
|
||||||
@ -369,20 +360,19 @@ static int do_child( orte_proc_t *child,
|
|||||||
always outputs a nice, single message indicating what
|
always outputs a nice, single message indicating what
|
||||||
happened
|
happened
|
||||||
*/
|
*/
|
||||||
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&opts,
|
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&cd->opts, &cd->env))) {
|
||||||
&environ_copy))) {
|
|
||||||
ORTE_ERROR_LOG(i);
|
ORTE_ERROR_LOG(i);
|
||||||
send_error_show_help(write_fd, 1,
|
send_error_show_help(write_fd, 1,
|
||||||
"help-orte-odls-alps.txt",
|
"help-orte-odls-alps.txt",
|
||||||
"iof setup failed",
|
"iof setup failed",
|
||||||
orte_process_info.nodename, app);
|
orte_process_info.nodename, cd->app->app);
|
||||||
/* Does not return */
|
/* Does not return */
|
||||||
}
|
}
|
||||||
|
|
||||||
/* now set any child-level controls such as binding */
|
/* now set any child-level controls such as binding */
|
||||||
orte_rtc.set(jobdat, child, &environ_copy, write_fd);
|
orte_rtc.set(cd->jdata, cd->child, &cd->env, write_fd);
|
||||||
|
|
||||||
} else if (!ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
|
} else if (!ORTE_FLAG_TEST(cd->jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
|
||||||
/* tie stdin/out/err/internal to /dev/null */
|
/* tie stdin/out/err/internal to /dev/null */
|
||||||
int fdnull;
|
int fdnull;
|
||||||
for (i=0; i < 3; i++) {
|
for (i=0; i < 3; i++) {
|
||||||
@ -393,24 +383,24 @@ static int do_child( orte_proc_t *child,
|
|||||||
close(fdnull);
|
close(fdnull);
|
||||||
}
|
}
|
||||||
fdnull = open("/dev/null", O_RDONLY, 0);
|
fdnull = open("/dev/null", O_RDONLY, 0);
|
||||||
if (fdnull > opts.p_internal[1]) {
|
if (fdnull > cd->opts.p_internal[1]) {
|
||||||
dup2(fdnull, opts.p_internal[1]);
|
dup2(fdnull, cd->opts.p_internal[1]);
|
||||||
}
|
}
|
||||||
close(fdnull);
|
close(fdnull);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, opts)) {
|
if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, cd->opts)) {
|
||||||
send_error_show_help(write_fd, 1, "help-orte-odls-alps.txt",
|
send_error_show_help(write_fd, 1, "help-orte-odls-alps.txt",
|
||||||
"close fds",
|
"close fds",
|
||||||
orte_process_info.nodename, app,
|
orte_process_info.nodename, cd->app->app,
|
||||||
__FILE__, __LINE__);
|
__FILE__, __LINE__);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (argv == NULL) {
|
if (cd->argv == NULL) {
|
||||||
argv = malloc(sizeof(char*)*2);
|
cd->argv = malloc(sizeof(char*)*2);
|
||||||
argv[0] = strdup(app);
|
cd->argv[0] = strdup(cd->app->app);
|
||||||
argv[1] = NULL;
|
cd->argv[1] = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Set signal handlers back to the default. Do this close to
|
/* Set signal handlers back to the default. Do this close to
|
||||||
@ -437,37 +427,33 @@ static int do_child( orte_proc_t *child,
|
|||||||
|
|
||||||
if (10 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
|
if (10 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
|
||||||
int jout;
|
int jout;
|
||||||
opal_output(0, "%s STARTING %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app);
|
opal_output(0, "%s STARTING %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cd->app->app);
|
||||||
for (jout=0; NULL != argv[jout]; jout++) {
|
for (jout=0; NULL != cd->argv[jout]; jout++) {
|
||||||
opal_output(0, "%s\tARGV[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, argv[jout]);
|
opal_output(0, "%s\tARGV[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, cd->argv[jout]);
|
||||||
}
|
}
|
||||||
for (jout=0; NULL != environ_copy[jout]; jout++) {
|
for (jout=0; NULL != cd->env[jout]; jout++) {
|
||||||
opal_output(0, "%s\tENVIRON[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, environ_copy[jout]);
|
opal_output(0, "%s\tENVIRON[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, cd->env[jout]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
execve(app, argv, environ_copy);
|
execve(cd->app->app, cd->argv, cd->env);
|
||||||
send_error_show_help(write_fd, 1,
|
send_error_show_help(write_fd, 1,
|
||||||
"help-orte-odls-alps.txt", "execve error",
|
"help-orte-odls-alps.txt", "execve error",
|
||||||
orte_process_info.nodename, app, strerror(errno));
|
orte_process_info.nodename, cd->app->app, strerror(errno));
|
||||||
/* Does not return */
|
/* Does not return */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static int do_parent(orte_proc_t *child,
|
static int do_parent(orte_odls_spawn_caddy_t *cd, int read_fd)
|
||||||
char *app, char **argv,
|
|
||||||
char **environ_copy,
|
|
||||||
orte_job_t *jobdat, int read_fd,
|
|
||||||
orte_iof_base_io_conf_t opts)
|
|
||||||
{
|
{
|
||||||
int rc;
|
int rc;
|
||||||
orte_odls_pipe_err_msg_t msg;
|
orte_odls_pipe_err_msg_t msg;
|
||||||
char file[ORTE_ODLS_MAX_FILE_LEN + 1], topic[ORTE_ODLS_MAX_TOPIC_LEN + 1], *str = NULL;
|
char file[ORTE_ODLS_MAX_FILE_LEN + 1], topic[ORTE_ODLS_MAX_TOPIC_LEN + 1], *str = NULL;
|
||||||
|
|
||||||
close(opts.p_stdin[0]);
|
close(cd->opts.p_stdin[0]);
|
||||||
close(opts.p_stdout[1]);
|
close(cd->opts.p_stdout[1]);
|
||||||
close(opts.p_stderr[1]);
|
close(cd->opts.p_stderr[1]);
|
||||||
close(opts.p_internal[1]);
|
close(cd->opts.p_internal[1]);
|
||||||
|
|
||||||
/* Block reading a message from the pipe */
|
/* Block reading a message from the pipe */
|
||||||
while (1) {
|
while (1) {
|
||||||
@ -483,18 +469,18 @@ static int do_parent(orte_proc_t *child,
|
|||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
close(read_fd);
|
close(read_fd);
|
||||||
|
|
||||||
if (NULL != child) {
|
if (NULL != cd->child) {
|
||||||
child->state = ORTE_PROC_STATE_UNDEF;
|
cd->child->state = ORTE_PROC_STATE_UNDEF;
|
||||||
}
|
}
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Otherwise, we got a warning or error message from the child */
|
/* Otherwise, we got a warning or error message from the child */
|
||||||
if (NULL != child) {
|
if (NULL != cd->child) {
|
||||||
if (msg.fatal) {
|
if (msg.fatal) {
|
||||||
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
|
ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
|
||||||
} else {
|
} else {
|
||||||
ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
|
ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -504,10 +490,10 @@ static int do_parent(orte_proc_t *child,
|
|||||||
if (OPAL_SUCCESS != rc) {
|
if (OPAL_SUCCESS != rc) {
|
||||||
orte_show_help("help-orte-odls-alps.txt", "syscall fail",
|
orte_show_help("help-orte-odls-alps.txt", "syscall fail",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename, app,
|
orte_process_info.nodename, cd->app,
|
||||||
"opal_fd_read", __FILE__, __LINE__);
|
"opal_fd_read", __FILE__, __LINE__);
|
||||||
if (NULL != child) {
|
if (NULL != cd->child) {
|
||||||
child->state = ORTE_PROC_STATE_UNDEF;
|
cd->child->state = ORTE_PROC_STATE_UNDEF;
|
||||||
}
|
}
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
@ -518,10 +504,10 @@ static int do_parent(orte_proc_t *child,
|
|||||||
if (OPAL_SUCCESS != rc) {
|
if (OPAL_SUCCESS != rc) {
|
||||||
orte_show_help("help-orte-odls-alps.txt", "syscall fail",
|
orte_show_help("help-orte-odls-alps.txt", "syscall fail",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename, app,
|
orte_process_info.nodename, cd->app,
|
||||||
"opal_fd_read", __FILE__, __LINE__);
|
"opal_fd_read", __FILE__, __LINE__);
|
||||||
if (NULL != child) {
|
if (NULL != cd->child) {
|
||||||
child->state = ORTE_PROC_STATE_UNDEF;
|
cd->child->state = ORTE_PROC_STATE_UNDEF;
|
||||||
}
|
}
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
@ -532,10 +518,10 @@ static int do_parent(orte_proc_t *child,
|
|||||||
if (NULL == str) {
|
if (NULL == str) {
|
||||||
orte_show_help("help-orte-odls-alps.txt", "syscall fail",
|
orte_show_help("help-orte-odls-alps.txt", "syscall fail",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename, app,
|
orte_process_info.nodename, cd->app,
|
||||||
"opal_fd_read", __FILE__, __LINE__);
|
"opal_fd_read", __FILE__, __LINE__);
|
||||||
if (NULL != child) {
|
if (NULL != cd->child) {
|
||||||
child->state = ORTE_PROC_STATE_UNDEF;
|
cd->child->state = ORTE_PROC_STATE_UNDEF;
|
||||||
}
|
}
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
@ -556,9 +542,9 @@ static int do_parent(orte_proc_t *child,
|
|||||||
closed, indicating that the child launched
|
closed, indicating that the child launched
|
||||||
successfully). */
|
successfully). */
|
||||||
if (msg.fatal) {
|
if (msg.fatal) {
|
||||||
if (NULL != child) {
|
if (NULL != cd->child) {
|
||||||
child->state = ORTE_PROC_STATE_FAILED_TO_START;
|
cd->child->state = ORTE_PROC_STATE_FAILED_TO_START;
|
||||||
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
|
ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
|
||||||
}
|
}
|
||||||
close(read_fd);
|
close(read_fd);
|
||||||
return ORTE_ERR_FAILED_TO_START;
|
return ORTE_ERR_FAILED_TO_START;
|
||||||
@ -568,9 +554,9 @@ static int do_parent(orte_proc_t *child,
|
|||||||
/* If we got here, it means that the pipe closed without
|
/* If we got here, it means that the pipe closed without
|
||||||
indication of a fatal error, meaning that the child process
|
indication of a fatal error, meaning that the child process
|
||||||
launched successfully. */
|
launched successfully. */
|
||||||
if (NULL != child) {
|
if (NULL != cd->child) {
|
||||||
child->state = ORTE_PROC_STATE_RUNNING;
|
cd->child->state = ORTE_PROC_STATE_RUNNING;
|
||||||
ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
|
ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
|
||||||
}
|
}
|
||||||
close(read_fd);
|
close(read_fd);
|
||||||
|
|
||||||
@ -581,14 +567,10 @@ static int do_parent(orte_proc_t *child,
|
|||||||
/**
|
/**
|
||||||
* Fork/exec the specified processes
|
* Fork/exec the specified processes
|
||||||
*/
|
*/
|
||||||
static int odls_alps_fork_local_proc(orte_proc_t *child,
|
static int odls_alps_fork_local_proc(void *cdptr)
|
||||||
char *app,
|
|
||||||
char **argv,
|
|
||||||
char **environ_copy,
|
|
||||||
orte_job_t *jobdat,
|
|
||||||
orte_iof_base_io_conf_t opts)
|
|
||||||
{
|
{
|
||||||
int rc, p[2];
|
orte_odls_spawn_caddy_t *cd = (orte_odls_spawn_caddy_t*)cdptr;
|
||||||
|
int p[2];
|
||||||
pid_t pid;
|
pid_t pid;
|
||||||
|
|
||||||
/* A pipe is used to communicate between the parent and child to
|
/* A pipe is used to communicate between the parent and child to
|
||||||
@ -601,24 +583,24 @@ static int odls_alps_fork_local_proc(orte_proc_t *child,
|
|||||||
the pipe, then the child was letting us know why it failed. */
|
the pipe, then the child was letting us know why it failed. */
|
||||||
if (pipe(p) < 0) {
|
if (pipe(p) < 0) {
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_PIPES);
|
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_PIPES);
|
||||||
if (NULL != child) {
|
if (NULL != cd->child) {
|
||||||
child->state = ORTE_PROC_STATE_FAILED_TO_START;
|
cd->child->state = ORTE_PROC_STATE_FAILED_TO_START;
|
||||||
child->exit_code = ORTE_ERR_SYS_LIMITS_PIPES;
|
cd->child->exit_code = ORTE_ERR_SYS_LIMITS_PIPES;
|
||||||
}
|
}
|
||||||
return ORTE_ERR_SYS_LIMITS_PIPES;
|
return ORTE_ERR_SYS_LIMITS_PIPES;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Fork off the child */
|
/* Fork off the child */
|
||||||
pid = fork();
|
pid = fork();
|
||||||
if (NULL != child) {
|
if (NULL != cd->child) {
|
||||||
child->pid = pid;
|
cd->child->pid = pid;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pid < 0) {
|
if (pid < 0) {
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
|
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
|
||||||
if (NULL != child) {
|
if (NULL != cd->child) {
|
||||||
child->state = ORTE_PROC_STATE_FAILED_TO_START;
|
cd->child->state = ORTE_PROC_STATE_FAILED_TO_START;
|
||||||
child->exit_code = ORTE_ERR_SYS_LIMITS_CHILDREN;
|
cd->child->exit_code = ORTE_ERR_SYS_LIMITS_CHILDREN;
|
||||||
}
|
}
|
||||||
return ORTE_ERR_SYS_LIMITS_CHILDREN;
|
return ORTE_ERR_SYS_LIMITS_CHILDREN;
|
||||||
}
|
}
|
||||||
@ -628,12 +610,12 @@ static int odls_alps_fork_local_proc(orte_proc_t *child,
|
|||||||
#if HAVE_SETPGID
|
#if HAVE_SETPGID
|
||||||
setpgid(0, 0);
|
setpgid(0, 0);
|
||||||
#endif
|
#endif
|
||||||
do_child(child, app, argv, environ_copy, jobdat, p[1], opts);
|
do_child(cd, p[1]);
|
||||||
/* Does not return */
|
/* Does not return */
|
||||||
}
|
}
|
||||||
|
|
||||||
close(p[1]);
|
close(p[1]);
|
||||||
return do_parent(child, app, argv, environ_copy, jobdat, p[0], opts);
|
return do_parent(cd, p[0]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -643,8 +625,8 @@ static int odls_alps_fork_local_proc(orte_proc_t *child,
|
|||||||
|
|
||||||
int orte_odls_alps_launch_local_procs(opal_buffer_t *data)
|
int orte_odls_alps_launch_local_procs(opal_buffer_t *data)
|
||||||
{
|
{
|
||||||
int rc;
|
|
||||||
orte_jobid_t job;
|
orte_jobid_t job;
|
||||||
|
int rc;
|
||||||
|
|
||||||
/* construct the list of children we are to launch */
|
/* construct the list of children we are to launch */
|
||||||
if (ORTE_SUCCESS != (rc = orte_odls_base_default_construct_child_list(data, &job))) {
|
if (ORTE_SUCCESS != (rc = orte_odls_base_default_construct_child_list(data, &job))) {
|
||||||
@ -729,4 +711,3 @@ static int orte_odls_alps_restart_proc(orte_proc_t *child)
|
|||||||
}
|
}
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -507,7 +507,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int setup_path(orte_app_context_t *app)
|
static int setup_path(orte_app_context_t *app, char **wdir)
|
||||||
{
|
{
|
||||||
int rc;
|
int rc;
|
||||||
char dir[MAXPATHLEN];
|
char dir[MAXPATHLEN];
|
||||||
@ -539,9 +539,12 @@ static int setup_path(orte_app_context_t *app)
|
|||||||
* ensuring they start out matching.
|
* ensuring they start out matching.
|
||||||
*/
|
*/
|
||||||
getcwd(dir, sizeof(dir));
|
getcwd(dir, sizeof(dir));
|
||||||
|
*wdir = strdup(dir);
|
||||||
opal_setenv("PWD", dir, true, &app->env);
|
opal_setenv("PWD", dir, true, &app->env);
|
||||||
/* update the initial wdir value too */
|
/* update the initial wdir value too */
|
||||||
opal_setenv(OPAL_MCA_PREFIX"initial_wdir", dir, true, &app->env);
|
opal_setenv(OPAL_MCA_PREFIX"initial_wdir", dir, true, &app->env);
|
||||||
|
} else {
|
||||||
|
*wdir = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Search for the OMPI_exec_path and PATH settings in the environment. */
|
/* Search for the OMPI_exec_path and PATH settings in the environment. */
|
||||||
@ -631,13 +634,12 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
|
|||||||
orte_job_t *jobdat = cd->jdata;
|
orte_job_t *jobdat = cd->jdata;
|
||||||
orte_app_context_t *app = cd->app;
|
orte_app_context_t *app = cd->app;
|
||||||
orte_proc_t *child = cd->child;
|
orte_proc_t *child = cd->child;
|
||||||
char **env = NULL, **argv = NULL, *cmd = NULL;
|
|
||||||
int rc, i;
|
int rc, i;
|
||||||
bool found;
|
bool found;
|
||||||
orte_proc_state_t state;
|
orte_proc_state_t state;
|
||||||
|
|
||||||
/* thread-protect common values */
|
/* thread-protect common values */
|
||||||
env = opal_argv_copy(app->env);
|
cd->env = opal_argv_copy(app->env);
|
||||||
|
|
||||||
/* ensure we clear any prior info regarding state or exit status in
|
/* ensure we clear any prior info regarding state or exit status in
|
||||||
* case this is a restart
|
* case this is a restart
|
||||||
@ -646,7 +648,7 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
|
|||||||
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_WAITPID);
|
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_WAITPID);
|
||||||
|
|
||||||
/* setup the pmix environment */
|
/* setup the pmix environment */
|
||||||
if (OPAL_SUCCESS != (rc = opal_pmix.server_setup_fork(&child->name, &env))) {
|
if (OPAL_SUCCESS != (rc = opal_pmix.server_setup_fork(&child->name, &cd->env))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
|
state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
|
||||||
goto errorout;
|
goto errorout;
|
||||||
@ -680,16 +682,16 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
|
|||||||
child->name.vpid == nm->name.vpid) {
|
child->name.vpid == nm->name.vpid) {
|
||||||
/* we want this one - modify the app's command to include
|
/* we want this one - modify the app's command to include
|
||||||
* the orte xterm cmd that starts with the xtermcmd */
|
* the orte xterm cmd that starts with the xtermcmd */
|
||||||
argv = opal_argv_copy(orte_odls_globals.xtermcmd);
|
cd->argv = opal_argv_copy(orte_odls_globals.xtermcmd);
|
||||||
/* insert the rank into the correct place as a window title */
|
/* insert the rank into the correct place as a window title */
|
||||||
free(argv[2]);
|
free(cd->argv[2]);
|
||||||
asprintf(&argv[2], "Rank %s", ORTE_VPID_PRINT(child->name.vpid));
|
asprintf(&cd->argv[2], "Rank %s", ORTE_VPID_PRINT(child->name.vpid));
|
||||||
/* add in the argv from the app */
|
/* add in the argv from the app */
|
||||||
for (i=0; NULL != app->argv[i]; i++) {
|
for (i=0; NULL != app->argv[i]; i++) {
|
||||||
opal_argv_append_nosize(&argv, app->argv[i]);
|
opal_argv_append_nosize(&cd->argv, app->argv[i]);
|
||||||
}
|
}
|
||||||
/* use the xterm cmd as the app string */
|
/* use the xterm cmd as the app string */
|
||||||
cmd = strdup(orte_odls_globals.xtermcmd[0]);
|
cd->cmd = strdup(orte_odls_globals.xtermcmd[0]);
|
||||||
found = true;
|
found = true;
|
||||||
break;
|
break;
|
||||||
} else if (jobdat->num_procs <= nm->name.vpid) { /* check for bozo case */
|
} else if (jobdat->num_procs <= nm->name.vpid) { /* check for bozo case */
|
||||||
@ -703,21 +705,21 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!found) {
|
if (!found) {
|
||||||
cmd = strdup(app->app);
|
cd->cmd = strdup(app->app);
|
||||||
argv = opal_argv_copy(app->argv);
|
cd->argv = opal_argv_copy(app->argv);
|
||||||
}
|
}
|
||||||
} else if (NULL != orte_fork_agent) {
|
} else if (NULL != orte_fork_agent) {
|
||||||
/* we were given a fork agent - use it */
|
/* we were given a fork agent - use it */
|
||||||
argv = opal_argv_copy(orte_fork_agent);
|
cd->argv = opal_argv_copy(orte_fork_agent);
|
||||||
/* add in the argv from the app */
|
/* add in the argv from the app */
|
||||||
for (i=0; NULL != app->argv[i]; i++) {
|
for (i=0; NULL != app->argv[i]; i++) {
|
||||||
opal_argv_append_nosize(&argv, app->argv[i]);
|
opal_argv_append_nosize(&cd->argv, app->argv[i]);
|
||||||
}
|
}
|
||||||
/* the app exe name itself is in the argvsav array, so
|
/* the app exe name itself is in the argvsav array, so
|
||||||
* we can recover it from there later
|
* we can recover it from there later
|
||||||
*/
|
*/
|
||||||
cmd = opal_path_findv(orte_fork_agent[0], X_OK, orte_launch_environ, NULL);
|
cd->cmd = opal_path_findv(orte_fork_agent[0], X_OK, orte_launch_environ, NULL);
|
||||||
if (NULL == cmd) {
|
if (NULL == cd->cmd) {
|
||||||
orte_show_help("help-orte-odls-base.txt",
|
orte_show_help("help-orte-odls-base.txt",
|
||||||
"orte-odls-base:fork-agent-not-found",
|
"orte-odls-base:fork-agent-not-found",
|
||||||
true, orte_process_info.nodename, orte_fork_agent[0]);
|
true, orte_process_info.nodename, orte_fork_agent[0]);
|
||||||
@ -725,14 +727,14 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
|
|||||||
goto errorout;
|
goto errorout;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
cmd = strdup(app->app);
|
cd->cmd = strdup(app->app);
|
||||||
argv = opal_argv_copy(app->argv);
|
cd->argv = opal_argv_copy(app->argv);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* setup the rest of the environment with the proc-specific items - these
|
/* setup the rest of the environment with the proc-specific items - these
|
||||||
* will be overwritten for each child
|
* will be overwritten for each child
|
||||||
*/
|
*/
|
||||||
if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app, &env))) {
|
if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app, &cd->env))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
|
state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
|
||||||
goto errorout;
|
goto errorout;
|
||||||
@ -741,9 +743,9 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
|
|||||||
/* if we are indexing the argv by rank, do so now */
|
/* if we are indexing the argv by rank, do so now */
|
||||||
if (cd->index_argv) {
|
if (cd->index_argv) {
|
||||||
char *param;
|
char *param;
|
||||||
asprintf(&param, "%s-%d", argv[0], (int)child->name.vpid);
|
asprintf(&param, "%s-%d", cd->argv[0], (int)child->name.vpid);
|
||||||
free(argv[0]);
|
free(cd->argv[0]);
|
||||||
argv[0] = param;
|
cd->argv[0] = param;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (5 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
|
if (5 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
|
||||||
@ -757,37 +759,19 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ORTE_SUCCESS != (rc = cd->fork_local(child, cmd, argv, env, jobdat, cd->opts))) {
|
if (ORTE_SUCCESS != (rc = cd->fork_local(cd))) {
|
||||||
/* error message already output */
|
/* error message already output */
|
||||||
state = ORTE_PROC_STATE_FAILED_TO_START;
|
state = ORTE_PROC_STATE_FAILED_TO_START;
|
||||||
goto errorout;
|
goto errorout;
|
||||||
}
|
}
|
||||||
|
|
||||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_RUNNING);
|
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_RUNNING);
|
||||||
if (NULL != env) {
|
|
||||||
opal_argv_free(env);
|
|
||||||
}
|
|
||||||
if (NULL != argv) {
|
|
||||||
opal_argv_free(argv);
|
|
||||||
}
|
|
||||||
if (NULL != cmd) {
|
|
||||||
free(cmd);
|
|
||||||
}
|
|
||||||
OBJ_RELEASE(cd);
|
OBJ_RELEASE(cd);
|
||||||
return;
|
return;
|
||||||
|
|
||||||
errorout:
|
errorout:
|
||||||
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
|
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
|
||||||
ORTE_ACTIVATE_PROC_STATE(&child->name, state);
|
ORTE_ACTIVATE_PROC_STATE(&child->name, state);
|
||||||
if (NULL != env) {
|
|
||||||
opal_argv_free(env);
|
|
||||||
}
|
|
||||||
if (NULL != argv) {
|
|
||||||
opal_argv_free(argv);
|
|
||||||
}
|
|
||||||
if (NULL != cmd) {
|
|
||||||
free(cmd);
|
|
||||||
}
|
|
||||||
OBJ_RELEASE(cd);
|
OBJ_RELEASE(cd);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -807,6 +791,7 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
|||||||
char *msg;
|
char *msg;
|
||||||
orte_odls_spawn_caddy_t *cd;
|
orte_odls_spawn_caddy_t *cd;
|
||||||
opal_event_base_t *evb;
|
opal_event_base_t *evb;
|
||||||
|
char *effective_dir = NULL;
|
||||||
|
|
||||||
opal_output_verbose(5, orte_odls_base_framework.framework_output,
|
opal_output_verbose(5, orte_odls_base_framework.framework_output,
|
||||||
"%s local:launch",
|
"%s local:launch",
|
||||||
@ -945,7 +930,7 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
|||||||
/* setup the working directory for this app - will jump us
|
/* setup the working directory for this app - will jump us
|
||||||
* to that directory
|
* to that directory
|
||||||
*/
|
*/
|
||||||
if (ORTE_SUCCESS != (rc = setup_path(app))) {
|
if (ORTE_SUCCESS != (rc = setup_path(app, &effective_dir))) {
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||||
"%s odls:launch:setup_path failed with error %s(%d)",
|
"%s odls:launch:setup_path failed with error %s(%d)",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -1009,6 +994,15 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
|||||||
goto GETOUT;
|
goto GETOUT;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* reset our working directory back to our default location - if we
|
||||||
|
* don't do this, then we will be looking for relative paths starting
|
||||||
|
* from the last wdir option specified by the user. Thus, we would
|
||||||
|
* be requiring that the user keep track on the cmd line of where
|
||||||
|
* each app was located relative to the prior app, instead of relative
|
||||||
|
* to their current location
|
||||||
|
*/
|
||||||
|
chdir(basedir);
|
||||||
|
|
||||||
/* okay, now let's launch all the local procs for this app using the provided fork_local fn */
|
/* okay, now let's launch all the local procs for this app using the provided fork_local fn */
|
||||||
for (idx=0; idx < orte_local_children->size; idx++) {
|
for (idx=0; idx < orte_local_children->size; idx++) {
|
||||||
if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
|
if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
|
||||||
@ -1066,6 +1060,9 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
|||||||
|
|
||||||
/* dispatch this child to the next available launch thread */
|
/* dispatch this child to the next available launch thread */
|
||||||
cd = OBJ_NEW(orte_odls_spawn_caddy_t);
|
cd = OBJ_NEW(orte_odls_spawn_caddy_t);
|
||||||
|
if (NULL != effective_dir) {
|
||||||
|
cd->wdir = strdup(effective_dir);
|
||||||
|
}
|
||||||
cd->jdata = jobdat;
|
cd->jdata = jobdat;
|
||||||
cd->app = app;
|
cd->app = app;
|
||||||
cd->child = child;
|
cd->child = child;
|
||||||
@ -1114,14 +1111,9 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
|||||||
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);
|
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);
|
||||||
|
|
||||||
}
|
}
|
||||||
/* reset our working directory back to our default location - if we
|
if (NULL != effective_dir) {
|
||||||
* don't do this, then we will be looking for relative paths starting
|
free(effective_dir);
|
||||||
* from the last wdir option specified by the user. Thus, we would
|
}
|
||||||
* be requiring that the user keep track on the cmd line of where
|
|
||||||
* each app was located relative to the prior app, instead of relative
|
|
||||||
* to their current location
|
|
||||||
*/
|
|
||||||
chdir(basedir);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
GETOUT:
|
GETOUT:
|
||||||
@ -1682,7 +1674,9 @@ int orte_odls_base_default_restart_proc(orte_proc_t *child,
|
|||||||
orte_app_context_t *app;
|
orte_app_context_t *app;
|
||||||
orte_job_t *jobdat;
|
orte_job_t *jobdat;
|
||||||
char basedir[MAXPATHLEN];
|
char basedir[MAXPATHLEN];
|
||||||
orte_iof_base_io_conf_t opts;
|
char *wdir = NULL;
|
||||||
|
orte_odls_spawn_caddy_t *cd;
|
||||||
|
opal_event_base_t *evb;
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||||
"%s odls:restart_proc for proc %s",
|
"%s odls:restart_proc for proc %s",
|
||||||
@ -1720,35 +1714,71 @@ int orte_odls_base_default_restart_proc(orte_proc_t *child,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* setup the path */
|
/* setup the path */
|
||||||
if (ORTE_SUCCESS != (rc = setup_path(app))) {
|
if (ORTE_SUCCESS != (rc = setup_path(app, &wdir))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
goto CLEANUP;
|
goto CLEANUP;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* dispatch this child to the next available launch thread */
|
||||||
|
cd = OBJ_NEW(orte_odls_spawn_caddy_t);
|
||||||
|
if (NULL != wdir) {
|
||||||
|
cd->wdir = strdup(wdir);
|
||||||
|
free(wdir);
|
||||||
|
}
|
||||||
|
cd->jdata = jobdat;
|
||||||
|
cd->app = app;
|
||||||
|
cd->child = child;
|
||||||
|
cd->fork_local = fork_local;
|
||||||
/* setup any IOF */
|
/* setup any IOF */
|
||||||
memset(&opts, 0, sizeof(orte_iof_base_io_conf_t));
|
cd->opts.usepty = OPAL_ENABLE_PTY_SUPPORT;
|
||||||
|
|
||||||
|
/* do we want to setup stdin? */
|
||||||
|
if (jobdat->stdin_target == ORTE_VPID_WILDCARD ||
|
||||||
|
child->name.vpid == jobdat->stdin_target) {
|
||||||
|
cd->opts.connect_stdin = true;
|
||||||
|
} else {
|
||||||
|
cd->opts.connect_stdin = false;
|
||||||
|
}
|
||||||
|
if (ORTE_SUCCESS != (rc = orte_iof_base_setup_prefork(&cd->opts))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
child->exit_code = rc;
|
||||||
|
OBJ_RELEASE(cd);
|
||||||
|
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
|
||||||
|
goto CLEANUP;
|
||||||
|
}
|
||||||
if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
|
if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
|
||||||
/* connect endpoints IOF */
|
/* connect endpoints IOF */
|
||||||
rc = orte_iof_base_setup_parent(&child->name, &opts);
|
rc = orte_iof_base_setup_parent(&child->name, &cd->opts);
|
||||||
if (ORTE_SUCCESS != rc) {
|
if (ORTE_SUCCESS != rc) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_START);
|
OBJ_RELEASE(cd);
|
||||||
|
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
|
||||||
goto CLEANUP;
|
goto CLEANUP;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
orte_wait_cb(child, odls_base_default_wait_local_proc, NULL);
|
||||||
|
|
||||||
|
++orte_odls_globals.next_base;
|
||||||
|
if (orte_odls_globals.num_threads <= orte_odls_globals.next_base) {
|
||||||
|
orte_odls_globals.next_base = 0;
|
||||||
|
}
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||||
"%s restarting app %s",
|
"%s restarting app %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->app));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->app));
|
||||||
|
|
||||||
orte_wait_cb(child, odls_base_default_wait_local_proc, NULL);
|
evb = orte_odls_globals.ev_bases[orte_odls_globals.next_base];
|
||||||
if (ORTE_SUCCESS != (rc = fork_local(child, app->app, app->argv, app->env, jobdat, opts))) {
|
opal_event_set(evb, &cd->ev, -1,
|
||||||
|
OPAL_EV_WRITE, orte_odls_base_spawn_proc, cd);
|
||||||
|
opal_event_set_priority(&cd->ev, ORTE_MSG_PRI);
|
||||||
|
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);
|
||||||
|
|
||||||
|
if (ORTE_SUCCESS != (rc = fork_local(cd))) {
|
||||||
orte_wait_cb_cancel(child);
|
orte_wait_cb_cancel(child);
|
||||||
child->exit_code = ORTE_ERR_SILENT; /* error message already output */
|
child->exit_code = ORTE_ERR_SILENT; /* error message already output */
|
||||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_START);
|
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_START);
|
||||||
}
|
}
|
||||||
|
|
||||||
CLEANUP:
|
CLEANUP:
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||||
"%s odls:restart of proc %s %s",
|
"%s odls:restart of proc %s %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
@ -239,7 +239,26 @@ OBJ_CLASS_INSTANCE(orte_odls_launch_local_t,
|
|||||||
static void sccon(orte_odls_spawn_caddy_t *p)
|
static void sccon(orte_odls_spawn_caddy_t *p)
|
||||||
{
|
{
|
||||||
memset(&p->opts, 0, sizeof(orte_iof_base_io_conf_t));
|
memset(&p->opts, 0, sizeof(orte_iof_base_io_conf_t));
|
||||||
|
p->cmd = NULL;
|
||||||
|
p->wdir = NULL;
|
||||||
|
p->argv = NULL;
|
||||||
|
p->env = NULL;
|
||||||
|
}
|
||||||
|
static void scdes(orte_odls_spawn_caddy_t *p)
|
||||||
|
{
|
||||||
|
if (NULL != p->cmd) {
|
||||||
|
free(p->cmd);
|
||||||
|
}
|
||||||
|
if (NULL != p->wdir) {
|
||||||
|
free(p->wdir);
|
||||||
|
}
|
||||||
|
if (NULL != p->argv) {
|
||||||
|
opal_argv_free(p->argv);
|
||||||
|
}
|
||||||
|
if (NULL != p->env) {
|
||||||
|
opal_argv_free(p->env);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
OBJ_CLASS_INSTANCE(orte_odls_spawn_caddy_t,
|
OBJ_CLASS_INSTANCE(orte_odls_spawn_caddy_t,
|
||||||
opal_object_t,
|
opal_object_t,
|
||||||
sccon, NULL);
|
sccon, scdes);
|
||||||
|
@ -82,16 +82,16 @@ orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
|||||||
ORTE_DECLSPEC void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata);
|
ORTE_DECLSPEC void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata);
|
||||||
|
|
||||||
/* define a function that will fork a local proc */
|
/* define a function that will fork a local proc */
|
||||||
typedef int (*orte_odls_base_fork_local_proc_fn_t)(orte_proc_t *child,
|
typedef int (*orte_odls_base_fork_local_proc_fn_t)(void *cd);
|
||||||
char *app, char **argv,
|
|
||||||
char **environ_copy,
|
|
||||||
orte_job_t *jdata,
|
|
||||||
orte_iof_base_io_conf_t opts);
|
|
||||||
|
|
||||||
/* define an object for fork/exec the local proc */
|
/* define an object for fork/exec the local proc */
|
||||||
typedef struct {
|
typedef struct {
|
||||||
opal_object_t super;
|
opal_object_t super;
|
||||||
opal_event_t ev;
|
opal_event_t ev;
|
||||||
|
char *cmd;
|
||||||
|
char *wdir;
|
||||||
|
char **argv;
|
||||||
|
char **env;
|
||||||
orte_job_t *jdata;
|
orte_job_t *jdata;
|
||||||
orte_app_context_t *app;
|
orte_app_context_t *app;
|
||||||
orte_proc_t *child;
|
orte_proc_t *child;
|
||||||
|
@ -145,11 +145,7 @@ static void send_error_show_help(int fd, int exit_status,
|
|||||||
const char *file, const char *topic, ...)
|
const char *file, const char *topic, ...)
|
||||||
__opal_attribute_noreturn__;
|
__opal_attribute_noreturn__;
|
||||||
|
|
||||||
static int do_child(orte_proc_t *child,
|
static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
|
||||||
char *cmd, char **argv,
|
|
||||||
char **environ_copy,
|
|
||||||
orte_job_t *jobdat, int write_fd,
|
|
||||||
orte_iof_base_io_conf_t opts)
|
|
||||||
__opal_attribute_noreturn__;
|
__opal_attribute_noreturn__;
|
||||||
|
|
||||||
|
|
||||||
@ -319,11 +315,7 @@ static int close_open_file_descriptors(int write_fd,
|
|||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int do_child(orte_proc_t *child,
|
static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
|
||||||
char *app, char **argv,
|
|
||||||
char **environ_copy,
|
|
||||||
orte_job_t *jobdat, int write_fd,
|
|
||||||
orte_iof_base_io_conf_t opts)
|
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
sigset_t sigs;
|
sigset_t sigs;
|
||||||
@ -339,7 +331,7 @@ static int do_child(orte_proc_t *child,
|
|||||||
/* Setup the pipe to be close-on-exec */
|
/* Setup the pipe to be close-on-exec */
|
||||||
opal_fd_set_cloexec(write_fd);
|
opal_fd_set_cloexec(write_fd);
|
||||||
|
|
||||||
if (NULL != child) {
|
if (NULL != cd->child) {
|
||||||
/* setup stdout/stderr so that any error messages that we
|
/* setup stdout/stderr so that any error messages that we
|
||||||
may print out will get displayed back at orterun.
|
may print out will get displayed back at orterun.
|
||||||
|
|
||||||
@ -353,22 +345,21 @@ static int do_child(orte_proc_t *child,
|
|||||||
always outputs a nice, single message indicating what
|
always outputs a nice, single message indicating what
|
||||||
happened
|
happened
|
||||||
*/
|
*/
|
||||||
if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
|
if (ORTE_FLAG_TEST(cd->jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
|
||||||
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&opts,
|
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&cd->opts, &cd->env))) {
|
||||||
&environ_copy))) {
|
|
||||||
ORTE_ERROR_LOG(i);
|
ORTE_ERROR_LOG(i);
|
||||||
send_error_show_help(write_fd, 1,
|
send_error_show_help(write_fd, 1,
|
||||||
"help-orte-odls-default.txt",
|
"help-orte-odls-default.txt",
|
||||||
"iof setup failed",
|
"iof setup failed",
|
||||||
orte_process_info.nodename, app);
|
orte_process_info.nodename, cd->app->app);
|
||||||
/* Does not return */
|
/* Does not return */
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* now set any child-level controls such as binding */
|
/* now set any child-level controls such as binding */
|
||||||
orte_rtc.set(jobdat, child, &environ_copy, write_fd);
|
orte_rtc.set(cd->jdata, cd->child, &cd->env, write_fd);
|
||||||
|
|
||||||
} else if (!ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
|
} else if (!ORTE_FLAG_TEST(cd->jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
|
||||||
/* tie stdin/out/err/internal to /dev/null */
|
/* tie stdin/out/err/internal to /dev/null */
|
||||||
int fdnull;
|
int fdnull;
|
||||||
for (i=0; i < 3; i++) {
|
for (i=0; i < 3; i++) {
|
||||||
@ -379,8 +370,8 @@ static int do_child(orte_proc_t *child,
|
|||||||
close(fdnull);
|
close(fdnull);
|
||||||
}
|
}
|
||||||
fdnull = open("/dev/null", O_RDONLY, 0);
|
fdnull = open("/dev/null", O_RDONLY, 0);
|
||||||
if (fdnull > opts.p_internal[1]) {
|
if (fdnull > cd->opts.p_internal[1]) {
|
||||||
dup2(fdnull, opts.p_internal[1]);
|
dup2(fdnull, cd->opts.p_internal[1]);
|
||||||
}
|
}
|
||||||
close(fdnull);
|
close(fdnull);
|
||||||
}
|
}
|
||||||
@ -388,19 +379,19 @@ static int do_child(orte_proc_t *child,
|
|||||||
/* close all open file descriptors w/ exception of stdin/stdout/stderr,
|
/* close all open file descriptors w/ exception of stdin/stdout/stderr,
|
||||||
the pipe used for the IOF INTERNAL messages, and the pipe up to
|
the pipe used for the IOF INTERNAL messages, and the pipe up to
|
||||||
the parent. */
|
the parent. */
|
||||||
if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, opts)) {
|
if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, cd->opts)) {
|
||||||
// close *all* file descriptors -- slow
|
// close *all* file descriptors -- slow
|
||||||
for(fd=3; fd<fdmax; fd++) {
|
for(fd=3; fd<fdmax; fd++) {
|
||||||
if (fd != opts.p_internal[1] && fd != write_fd) {
|
if (fd != cd->opts.p_internal[1] && fd != write_fd) {
|
||||||
close(fd);
|
close(fd);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (argv == NULL) {
|
if (cd->argv == NULL) {
|
||||||
argv = malloc(sizeof(char*)*2);
|
cd->argv = malloc(sizeof(char*)*2);
|
||||||
argv[0] = strdup(app);
|
cd->argv[0] = strdup(cd->app->app);
|
||||||
argv[1] = NULL;
|
cd->argv[1] = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Set signal handlers back to the default. Do this close to
|
/* Set signal handlers back to the default. Do this close to
|
||||||
@ -423,31 +414,31 @@ static int do_child(orte_proc_t *child,
|
|||||||
sigprocmask(0, 0, &sigs);
|
sigprocmask(0, 0, &sigs);
|
||||||
sigprocmask(SIG_UNBLOCK, &sigs, 0);
|
sigprocmask(SIG_UNBLOCK, &sigs, 0);
|
||||||
|
|
||||||
/* Exec the new executable */
|
/* take us to the correct wdir */
|
||||||
|
if (NULL != cd->wdir) {
|
||||||
|
chdir(cd->wdir);
|
||||||
|
}
|
||||||
|
|
||||||
execve(app, argv, environ_copy);
|
/* Exec the new executable */
|
||||||
|
execve(cd->app->app, cd->argv, cd->env);
|
||||||
getcwd(dir, sizeof(dir));
|
getcwd(dir, sizeof(dir));
|
||||||
send_error_show_help(write_fd, 1,
|
send_error_show_help(write_fd, 1,
|
||||||
"help-orte-odls-default.txt", "execve error",
|
"help-orte-odls-default.txt", "execve error",
|
||||||
orte_process_info.nodename, dir, app, strerror(errno));
|
orte_process_info.nodename, dir, cd->app->app, strerror(errno));
|
||||||
/* Does not return */
|
/* Does not return */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static int do_parent(orte_proc_t *child,
|
static int do_parent(orte_odls_spawn_caddy_t *cd, int read_fd)
|
||||||
char *app, char **argv,
|
|
||||||
char **environ_copy,
|
|
||||||
orte_job_t *jobdat, int read_fd,
|
|
||||||
orte_iof_base_io_conf_t opts)
|
|
||||||
{
|
{
|
||||||
int rc;
|
int rc;
|
||||||
orte_odls_pipe_err_msg_t msg;
|
orte_odls_pipe_err_msg_t msg;
|
||||||
char file[ORTE_ODLS_MAX_FILE_LEN + 1], topic[ORTE_ODLS_MAX_TOPIC_LEN + 1], *str = NULL;
|
char file[ORTE_ODLS_MAX_FILE_LEN + 1], topic[ORTE_ODLS_MAX_TOPIC_LEN + 1], *str = NULL;
|
||||||
|
|
||||||
close(opts.p_stdin[0]);
|
close(cd->opts.p_stdin[0]);
|
||||||
close(opts.p_stdout[1]);
|
close(cd->opts.p_stdout[1]);
|
||||||
close(opts.p_stderr[1]);
|
close(cd->opts.p_stderr[1]);
|
||||||
close(opts.p_internal[1]);
|
close(cd->opts.p_internal[1]);
|
||||||
|
|
||||||
/* Block reading a message from the pipe */
|
/* Block reading a message from the pipe */
|
||||||
while (1) {
|
while (1) {
|
||||||
@ -463,18 +454,18 @@ static int do_parent(orte_proc_t *child,
|
|||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
close(read_fd);
|
close(read_fd);
|
||||||
|
|
||||||
if (NULL != child) {
|
if (NULL != cd->child) {
|
||||||
child->state = ORTE_PROC_STATE_UNDEF;
|
cd->child->state = ORTE_PROC_STATE_UNDEF;
|
||||||
}
|
}
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Otherwise, we got a warning or error message from the child */
|
/* Otherwise, we got a warning or error message from the child */
|
||||||
if (NULL != child) {
|
if (NULL != cd->child) {
|
||||||
if (msg.fatal) {
|
if (msg.fatal) {
|
||||||
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
|
ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
|
||||||
} else {
|
} else {
|
||||||
ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
|
ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -484,10 +475,10 @@ static int do_parent(orte_proc_t *child,
|
|||||||
if (OPAL_SUCCESS != rc) {
|
if (OPAL_SUCCESS != rc) {
|
||||||
orte_show_help("help-orte-odls-default.txt", "syscall fail",
|
orte_show_help("help-orte-odls-default.txt", "syscall fail",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename, app,
|
orte_process_info.nodename, cd->app->app,
|
||||||
"opal_fd_read", __FILE__, __LINE__);
|
"opal_fd_read", __FILE__, __LINE__);
|
||||||
if (NULL != child) {
|
if (NULL != cd->child) {
|
||||||
child->state = ORTE_PROC_STATE_UNDEF;
|
cd->child->state = ORTE_PROC_STATE_UNDEF;
|
||||||
}
|
}
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
@ -498,10 +489,10 @@ static int do_parent(orte_proc_t *child,
|
|||||||
if (OPAL_SUCCESS != rc) {
|
if (OPAL_SUCCESS != rc) {
|
||||||
orte_show_help("help-orte-odls-default.txt", "syscall fail",
|
orte_show_help("help-orte-odls-default.txt", "syscall fail",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename, app,
|
orte_process_info.nodename, cd->app->app,
|
||||||
"opal_fd_read", __FILE__, __LINE__);
|
"opal_fd_read", __FILE__, __LINE__);
|
||||||
if (NULL != child) {
|
if (NULL != cd->child) {
|
||||||
child->state = ORTE_PROC_STATE_UNDEF;
|
cd->child->state = ORTE_PROC_STATE_UNDEF;
|
||||||
}
|
}
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
@ -512,10 +503,10 @@ static int do_parent(orte_proc_t *child,
|
|||||||
if (NULL == str) {
|
if (NULL == str) {
|
||||||
orte_show_help("help-orte-odls-default.txt", "syscall fail",
|
orte_show_help("help-orte-odls-default.txt", "syscall fail",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename, app,
|
orte_process_info.nodename, cd->app->app,
|
||||||
"opal_fd_read", __FILE__, __LINE__);
|
"opal_fd_read", __FILE__, __LINE__);
|
||||||
if (NULL != child) {
|
if (NULL != cd->child) {
|
||||||
child->state = ORTE_PROC_STATE_UNDEF;
|
cd->child->state = ORTE_PROC_STATE_UNDEF;
|
||||||
}
|
}
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
@ -536,9 +527,9 @@ static int do_parent(orte_proc_t *child,
|
|||||||
closed, indicating that the child launched
|
closed, indicating that the child launched
|
||||||
successfully). */
|
successfully). */
|
||||||
if (msg.fatal) {
|
if (msg.fatal) {
|
||||||
if (NULL != child) {
|
if (NULL != cd->child) {
|
||||||
child->state = ORTE_PROC_STATE_FAILED_TO_START;
|
cd->child->state = ORTE_PROC_STATE_FAILED_TO_START;
|
||||||
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
|
ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
|
||||||
}
|
}
|
||||||
close(read_fd);
|
close(read_fd);
|
||||||
return ORTE_ERR_FAILED_TO_START;
|
return ORTE_ERR_FAILED_TO_START;
|
||||||
@ -548,9 +539,9 @@ static int do_parent(orte_proc_t *child,
|
|||||||
/* If we got here, it means that the pipe closed without
|
/* If we got here, it means that the pipe closed without
|
||||||
indication of a fatal error, meaning that the child process
|
indication of a fatal error, meaning that the child process
|
||||||
launched successfully. */
|
launched successfully. */
|
||||||
if (NULL != child) {
|
if (NULL != cd->child) {
|
||||||
child->state = ORTE_PROC_STATE_RUNNING;
|
cd->child->state = ORTE_PROC_STATE_RUNNING;
|
||||||
ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
|
ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
|
||||||
}
|
}
|
||||||
close(read_fd);
|
close(read_fd);
|
||||||
|
|
||||||
@ -561,15 +552,12 @@ static int do_parent(orte_proc_t *child,
|
|||||||
/**
|
/**
|
||||||
* Fork/exec the specified processes
|
* Fork/exec the specified processes
|
||||||
*/
|
*/
|
||||||
static int odls_default_fork_local_proc(orte_proc_t *child,
|
static int odls_default_fork_local_proc(void *cdptr)
|
||||||
char *app,
|
|
||||||
char **argv,
|
|
||||||
char **environ_copy,
|
|
||||||
orte_job_t *jobdat,
|
|
||||||
orte_iof_base_io_conf_t opts)
|
|
||||||
{
|
{
|
||||||
|
orte_odls_spawn_caddy_t *cd = (orte_odls_spawn_caddy_t*)cdptr;
|
||||||
int p[2];
|
int p[2];
|
||||||
pid_t pid;
|
pid_t pid;
|
||||||
|
orte_proc_t *child = cd->child;
|
||||||
|
|
||||||
/* A pipe is used to communicate between the parent and child to
|
/* A pipe is used to communicate between the parent and child to
|
||||||
indicate whether the exec ultimately succeeded or failed. The
|
indicate whether the exec ultimately succeeded or failed. The
|
||||||
@ -605,12 +593,12 @@ static int odls_default_fork_local_proc(orte_proc_t *child,
|
|||||||
|
|
||||||
if (pid == 0) {
|
if (pid == 0) {
|
||||||
close(p[0]);
|
close(p[0]);
|
||||||
do_child(child, app, argv, environ_copy, jobdat, p[1], opts);
|
do_child(cd, p[1]);
|
||||||
/* Does not return */
|
/* Does not return */
|
||||||
}
|
}
|
||||||
|
|
||||||
close(p[1]);
|
close(p[1]);
|
||||||
return do_parent(child, app, argv, environ_copy, jobdat, p[0], opts);
|
return do_parent(cd, p[0]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user