1
1
Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Ralph Castain 2017-03-21 17:41:11 -06:00
родитель 75684dc260
Коммит 74fd2c30af

Просмотреть файл

@ -144,11 +144,7 @@ static int orte_odls_alps_restart_proc(orte_proc_t *child);
static void send_error_show_help(int fd, int exit_status, static void send_error_show_help(int fd, int exit_status,
const char *file, const char *topic, ...) const char *file, const char *topic, ...)
__opal_attribute_noreturn__; __opal_attribute_noreturn__;
static int do_child(orte_proc_t *child, static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
char *app, char **argv,
char **environ_copy,
orte_job_t *jobdat, int write_fd,
orte_iof_base_io_conf_t opts)
__opal_attribute_noreturn__; __opal_attribute_noreturn__;
@ -344,9 +340,8 @@ static int close_open_file_descriptors(int write_fd, orte_iof_base_io_conf_t opt
static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd) static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
{ {
int i, rc; int i;
sigset_t sigs; sigset_t sigs;
char *param, *msg;
/* Setup the pipe to be close-on-exec */ /* Setup the pipe to be close-on-exec */
opal_fd_set_cloexec(write_fd); opal_fd_set_cloexec(write_fd);
@ -449,20 +444,16 @@ static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
} }
static int do_parent(orte_proc_t *child, static int do_parent(orte_odls_spawn_caddy_t *cd, int read_fd)
char *app, char **argv,
char **environ_copy,
orte_job_t *jobdat, int read_fd,
orte_iof_base_io_conf_t opts)
{ {
int rc; int rc;
orte_odls_pipe_err_msg_t msg; orte_odls_pipe_err_msg_t msg;
char file[ORTE_ODLS_MAX_FILE_LEN + 1], topic[ORTE_ODLS_MAX_TOPIC_LEN + 1], *str = NULL; char file[ORTE_ODLS_MAX_FILE_LEN + 1], topic[ORTE_ODLS_MAX_TOPIC_LEN + 1], *str = NULL;
close(opts.p_stdin[0]); close(cd->opts.p_stdin[0]);
close(opts.p_stdout[1]); close(cd->opts.p_stdout[1]);
close(opts.p_stderr[1]); close(cd->opts.p_stderr[1]);
close(opts.p_internal[1]); close(cd->opts.p_internal[1]);
/* Block reading a message from the pipe */ /* Block reading a message from the pipe */
while (1) { while (1) {
@ -478,18 +469,18 @@ static int do_parent(orte_proc_t *child,
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
close(read_fd); close(read_fd);
if (NULL != child) { if (NULL != cd->child) {
child->state = ORTE_PROC_STATE_UNDEF; cd->child->state = ORTE_PROC_STATE_UNDEF;
} }
return rc; return rc;
} }
/* Otherwise, we got a warning or error message from the child */ /* Otherwise, we got a warning or error message from the child */
if (NULL != child) { if (NULL != cd->child) {
if (msg.fatal) { if (msg.fatal) {
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE); ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
} else { } else {
ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE); ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
} }
} }
@ -499,10 +490,10 @@ static int do_parent(orte_proc_t *child,
if (OPAL_SUCCESS != rc) { if (OPAL_SUCCESS != rc) {
orte_show_help("help-orte-odls-alps.txt", "syscall fail", orte_show_help("help-orte-odls-alps.txt", "syscall fail",
true, true,
orte_process_info.nodename, app, orte_process_info.nodename, cd->app,
"opal_fd_read", __FILE__, __LINE__); "opal_fd_read", __FILE__, __LINE__);
if (NULL != child) { if (NULL != cd->child) {
child->state = ORTE_PROC_STATE_UNDEF; cd->child->state = ORTE_PROC_STATE_UNDEF;
} }
return rc; return rc;
} }
@ -513,10 +504,10 @@ static int do_parent(orte_proc_t *child,
if (OPAL_SUCCESS != rc) { if (OPAL_SUCCESS != rc) {
orte_show_help("help-orte-odls-alps.txt", "syscall fail", orte_show_help("help-orte-odls-alps.txt", "syscall fail",
true, true,
orte_process_info.nodename, app, orte_process_info.nodename, cd->app,
"opal_fd_read", __FILE__, __LINE__); "opal_fd_read", __FILE__, __LINE__);
if (NULL != child) { if (NULL != cd->child) {
child->state = ORTE_PROC_STATE_UNDEF; cd->child->state = ORTE_PROC_STATE_UNDEF;
} }
return rc; return rc;
} }
@ -527,10 +518,10 @@ static int do_parent(orte_proc_t *child,
if (NULL == str) { if (NULL == str) {
orte_show_help("help-orte-odls-alps.txt", "syscall fail", orte_show_help("help-orte-odls-alps.txt", "syscall fail",
true, true,
orte_process_info.nodename, app, orte_process_info.nodename, cd->app,
"opal_fd_read", __FILE__, __LINE__); "opal_fd_read", __FILE__, __LINE__);
if (NULL != child) { if (NULL != cd->child) {
child->state = ORTE_PROC_STATE_UNDEF; cd->child->state = ORTE_PROC_STATE_UNDEF;
} }
return rc; return rc;
} }
@ -551,9 +542,9 @@ static int do_parent(orte_proc_t *child,
closed, indicating that the child launched closed, indicating that the child launched
successfully). */ successfully). */
if (msg.fatal) { if (msg.fatal) {
if (NULL != child) { if (NULL != cd->child) {
child->state = ORTE_PROC_STATE_FAILED_TO_START; cd->child->state = ORTE_PROC_STATE_FAILED_TO_START;
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE); ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
} }
close(read_fd); close(read_fd);
return ORTE_ERR_FAILED_TO_START; return ORTE_ERR_FAILED_TO_START;
@ -563,9 +554,9 @@ static int do_parent(orte_proc_t *child,
/* If we got here, it means that the pipe closed without /* If we got here, it means that the pipe closed without
indication of a fatal error, meaning that the child process indication of a fatal error, meaning that the child process
launched successfully. */ launched successfully. */
if (NULL != child) { if (NULL != cd->child) {
child->state = ORTE_PROC_STATE_RUNNING; cd->child->state = ORTE_PROC_STATE_RUNNING;
ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE); ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
} }
close(read_fd); close(read_fd);
@ -576,14 +567,10 @@ static int do_parent(orte_proc_t *child,
/** /**
* Fork/exec the specified processes * Fork/exec the specified processes
*/ */
static int odls_alps_fork_local_proc(orte_proc_t *child, static int odls_alps_fork_local_proc(void *cdptr)
char *app,
char **argv,
char **environ_copy,
orte_job_t *jobdat,
orte_iof_base_io_conf_t opts)
{ {
int rc, p[2]; orte_odls_spawn_caddy_t *cd = (orte_odls_spawn_caddy_t*)cdptr;
int p[2];
pid_t pid; pid_t pid;
/* A pipe is used to communicate between the parent and child to /* A pipe is used to communicate between the parent and child to
@ -596,24 +583,24 @@ static int odls_alps_fork_local_proc(orte_proc_t *child,
the pipe, then the child was letting us know why it failed. */ the pipe, then the child was letting us know why it failed. */
if (pipe(p) < 0) { if (pipe(p) < 0) {
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_PIPES); ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_PIPES);
if (NULL != child) { if (NULL != cd->child) {
child->state = ORTE_PROC_STATE_FAILED_TO_START; cd->child->state = ORTE_PROC_STATE_FAILED_TO_START;
child->exit_code = ORTE_ERR_SYS_LIMITS_PIPES; cd->child->exit_code = ORTE_ERR_SYS_LIMITS_PIPES;
} }
return ORTE_ERR_SYS_LIMITS_PIPES; return ORTE_ERR_SYS_LIMITS_PIPES;
} }
/* Fork off the child */ /* Fork off the child */
pid = fork(); pid = fork();
if (NULL != child) { if (NULL != cd->child) {
child->pid = pid; cd->child->pid = pid;
} }
if (pid < 0) { if (pid < 0) {
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN); ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
if (NULL != child) { if (NULL != cd->child) {
child->state = ORTE_PROC_STATE_FAILED_TO_START; cd->child->state = ORTE_PROC_STATE_FAILED_TO_START;
child->exit_code = ORTE_ERR_SYS_LIMITS_CHILDREN; cd->child->exit_code = ORTE_ERR_SYS_LIMITS_CHILDREN;
} }
return ORTE_ERR_SYS_LIMITS_CHILDREN; return ORTE_ERR_SYS_LIMITS_CHILDREN;
} }
@ -623,12 +610,12 @@ static int odls_alps_fork_local_proc(orte_proc_t *child,
#if HAVE_SETPGID #if HAVE_SETPGID
setpgid(0, 0); setpgid(0, 0);
#endif #endif
do_child(child, app, argv, environ_copy, jobdat, p[1], opts); do_child(cd, p[1]);
/* Does not return */ /* Does not return */
} }
close(p[1]); close(p[1]);
return do_parent(child, app, argv, environ_copy, jobdat, p[0], opts); return do_parent(cd, p[0]);
} }
@ -638,8 +625,8 @@ static int odls_alps_fork_local_proc(orte_proc_t *child,
int orte_odls_alps_launch_local_procs(opal_buffer_t *data) int orte_odls_alps_launch_local_procs(opal_buffer_t *data)
{ {
int rc;
orte_jobid_t job; orte_jobid_t job;
int rc;
/* construct the list of children we are to launch */ /* construct the list of children we are to launch */
if (ORTE_SUCCESS != (rc = orte_odls_base_default_construct_child_list(data, &job))) { if (ORTE_SUCCESS != (rc = orte_odls_base_default_construct_child_list(data, &job))) {