1
1

Merge pull request #3146 from rhc54/topic/alps

Update alps module to new APIs
Этот коммит содержится в:
Ralph Castain, 2017-03-12 10:35:29 -07:00 (коммит произвёл GitHub)
родители: fb27bd1b4a, 6d6bc9bd07
Коммит 59bcad5f8e
4 изменённых файла: 38 добавлений и 82 удаления

Просмотреть файл

@ -2,11 +2,11 @@ enable_orterun_prefix_by_default=yes
enable_mpi_thread_multiple=no enable_mpi_thread_multiple=no
enable_mem_debug=no enable_mem_debug=no
enable_mem_profile=no enable_mem_profile=no
enable_debug_symbols=yes enable_debug_symbols=no
enable_binaries=yes enable_binaries=yes
enable_heterogeneous=no enable_heterogeneous=no
enable_picky=yes enable_picky=yes
enable_debug=yes enable_debug=no
enable_shared=yes enable_shared=yes
enable_static=yes enable_static=yes
enable_memchecker=no enable_memchecker=no

Просмотреть файл

@ -63,4 +63,4 @@
mca_base_component_show_load_errors = 1 mca_base_component_show_load_errors = 1
orte_abort_timeout = 10 orte_abort_timeout = 10
hwloc_base_mem_bind_failure_action = silent hwloc_base_mem_bind_failure_action = silent
btl_ugni_rcache=grdma

Просмотреть файл

@ -151,7 +151,7 @@ if ($myresults) {
# determine the number of nodes - doesn't # determine the number of nodes - doesn't
# matter which starter we use # matter which starter we use
$cmd = "mpirun --novm --pernode hostname"; $cmd = "mpirun --pernode hostname";
$output = `$cmd`; $output = `$cmd`;
@lines = split(/\n/, $output); @lines = split(/\n/, $output);
$num_nodes = $#lines + 1; $num_nodes = $#lines + 1;

Просмотреть файл

@ -15,7 +15,7 @@
* Copyright (c) 2010 IBM Corporation. All rights reserved. * Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved * Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* *
* $COPYRIGHT$ * $COPYRIGHT$
* *
@ -144,8 +144,8 @@ static int orte_odls_alps_restart_proc(orte_proc_t *child);
static void send_error_show_help(int fd, int exit_status, static void send_error_show_help(int fd, int exit_status,
const char *file, const char *topic, ...) const char *file, const char *topic, ...)
__opal_attribute_noreturn__; __opal_attribute_noreturn__;
static int do_child(orte_app_context_t* context, static int do_child(orte_proc_t *child,
orte_proc_t *child, char *app, char **argv,
char **environ_copy, char **environ_copy,
orte_job_t *jobdat, int write_fd, orte_job_t *jobdat, int write_fd,
orte_iof_base_io_conf_t opts) orte_iof_base_io_conf_t opts)
@ -342,8 +342,8 @@ static int close_open_file_descriptors(int write_fd, orte_iof_base_io_conf_t opt
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
static int do_child(orte_app_context_t* context, static int do_child( orte_proc_t *child,
orte_proc_t *child, char *app, char **argv,
char **environ_copy, char **environ_copy,
orte_job_t *jobdat, int write_fd, orte_job_t *jobdat, int write_fd,
orte_iof_base_io_conf_t opts) orte_iof_base_io_conf_t opts)
@ -375,7 +375,7 @@ static int do_child(orte_app_context_t* context,
send_error_show_help(write_fd, 1, send_error_show_help(write_fd, 1,
"help-orte-odls-alps.txt", "help-orte-odls-alps.txt",
"iof setup failed", "iof setup failed",
orte_process_info.nodename, context->app); orte_process_info.nodename, app);
/* Does not return */ /* Does not return */
} }
@ -399,30 +399,18 @@ static int do_child(orte_app_context_t* context,
close(fdnull); close(fdnull);
} }
/* if the user requested it, set the system resource limits */
if (OPAL_SUCCESS != (rc = opal_util_init_sys_limits(&msg))) {
send_error_show_help(write_fd, 1, "help-orte-odls-alps.txt",
"set limit",
orte_process_info.nodename, context->app,
__FILE__, __LINE__, msg);
}
/* ensure we only do this once */
(void) mca_base_var_env_name("opal_set_max_sys_limits", &param);
opal_unsetenv(param, &environ_copy);
free(param);
if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, opts)) { if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, opts)) {
send_error_show_help(write_fd, 1, "help-orte-odls-alps.txt", send_error_show_help(write_fd, 1, "help-orte-odls-alps.txt",
"close fds", "close fds",
orte_process_info.nodename, context->app, orte_process_info.nodename, app,
__FILE__, __LINE__); __FILE__, __LINE__);
} }
if (context->argv == NULL) { if (argv == NULL) {
context->argv = malloc(sizeof(char*)*2); argv = malloc(sizeof(char*)*2);
context->argv[0] = strdup(context->app); argv[0] = strdup(app);
context->argv[1] = NULL; argv[1] = NULL;
} }
/* Set signal handlers back to the default. Do this close to /* Set signal handlers back to the default. Do this close to
@ -449,25 +437,25 @@ static int do_child(orte_app_context_t* context,
if (10 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) { if (10 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
int jout; int jout;
opal_output(0, "%s STARTING %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), context->app); opal_output(0, "%s STARTING %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app);
for (jout=0; NULL != context->argv[jout]; jout++) { for (jout=0; NULL != argv[jout]; jout++) {
opal_output(0, "%s\tARGV[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, context->argv[jout]); opal_output(0, "%s\tARGV[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, argv[jout]);
} }
for (jout=0; NULL != environ_copy[jout]; jout++) { for (jout=0; NULL != environ_copy[jout]; jout++) {
opal_output(0, "%s\tENVIRON[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, environ_copy[jout]); opal_output(0, "%s\tENVIRON[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, environ_copy[jout]);
} }
} }
execve(context->app, context->argv, environ_copy); execve(app, argv, environ_copy);
send_error_show_help(write_fd, 1, send_error_show_help(write_fd, 1,
"help-orte-odls-alps.txt", "execve error", "help-orte-odls-alps.txt", "execve error",
orte_process_info.nodename, context->app, strerror(errno)); orte_process_info.nodename, app, strerror(errno));
/* Does not return */ /* Does not return */
} }
static int do_parent(orte_app_context_t* context, static int do_parent(orte_proc_t *child,
orte_proc_t *child, char *app, char **argv,
char **environ_copy, char **environ_copy,
orte_job_t *jobdat, int read_fd, orte_job_t *jobdat, int read_fd,
orte_iof_base_io_conf_t opts) orte_iof_base_io_conf_t opts)
@ -476,19 +464,10 @@ static int do_parent(orte_app_context_t* context,
orte_odls_pipe_err_msg_t msg; orte_odls_pipe_err_msg_t msg;
char file[ORTE_ODLS_MAX_FILE_LEN + 1], topic[ORTE_ODLS_MAX_TOPIC_LEN + 1], *str = NULL; char file[ORTE_ODLS_MAX_FILE_LEN + 1], topic[ORTE_ODLS_MAX_TOPIC_LEN + 1], *str = NULL;
if (NULL != child && ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) { close(opts.p_stdin[0]);
/* connect endpoints IOF */ close(opts.p_stdout[1]);
rc = orte_iof_base_setup_parent(&child->name, &opts); close(opts.p_stderr[1]);
if (ORTE_SUCCESS != rc) { close(opts.p_internal[1]);
ORTE_ERROR_LOG(rc);
close(read_fd);
if (NULL != child) {
child->state = ORTE_PROC_STATE_UNDEF;
}
return rc;
}
}
/* Block reading a message from the pipe */ /* Block reading a message from the pipe */
while (1) { while (1) {
@ -525,7 +504,7 @@ static int do_parent(orte_app_context_t* context,
if (OPAL_SUCCESS != rc) { if (OPAL_SUCCESS != rc) {
orte_show_help("help-orte-odls-alps.txt", "syscall fail", orte_show_help("help-orte-odls-alps.txt", "syscall fail",
true, true,
orte_process_info.nodename, context->app, orte_process_info.nodename, app,
"opal_fd_read", __FILE__, __LINE__); "opal_fd_read", __FILE__, __LINE__);
if (NULL != child) { if (NULL != child) {
child->state = ORTE_PROC_STATE_UNDEF; child->state = ORTE_PROC_STATE_UNDEF;
@ -539,7 +518,7 @@ static int do_parent(orte_app_context_t* context,
if (OPAL_SUCCESS != rc) { if (OPAL_SUCCESS != rc) {
orte_show_help("help-orte-odls-alps.txt", "syscall fail", orte_show_help("help-orte-odls-alps.txt", "syscall fail",
true, true,
orte_process_info.nodename, context->app, orte_process_info.nodename, app,
"opal_fd_read", __FILE__, __LINE__); "opal_fd_read", __FILE__, __LINE__);
if (NULL != child) { if (NULL != child) {
child->state = ORTE_PROC_STATE_UNDEF; child->state = ORTE_PROC_STATE_UNDEF;
@ -553,7 +532,7 @@ static int do_parent(orte_app_context_t* context,
if (NULL == str) { if (NULL == str) {
orte_show_help("help-orte-odls-alps.txt", "syscall fail", orte_show_help("help-orte-odls-alps.txt", "syscall fail",
true, true,
orte_process_info.nodename, context->app, orte_process_info.nodename, app,
"opal_fd_read", __FILE__, __LINE__); "opal_fd_read", __FILE__, __LINE__);
if (NULL != child) { if (NULL != child) {
child->state = ORTE_PROC_STATE_UNDEF; child->state = ORTE_PROC_STATE_UNDEF;
@ -602,39 +581,16 @@ static int do_parent(orte_app_context_t* context,
/** /**
* Fork/exec the specified processes * Fork/exec the specified processes
*/ */
static int odls_alps_fork_local_proc(orte_app_context_t* context, static int odls_alps_fork_local_proc(orte_proc_t *child,
orte_proc_t *child, char *app,
char **environ_copy, char **argv,
orte_job_t *jobdat) char **environ_copy,
orte_job_t *jobdat,
orte_iof_base_io_conf_t opts)
{ {
orte_iof_base_io_conf_t opts;
int rc, p[2]; int rc, p[2];
pid_t pid; pid_t pid;
if (NULL != child) {
/* should pull this information from MPIRUN instead of going with
default */
opts.usepty = OPAL_ENABLE_PTY_SUPPORT;
/* do we want to setup stdin? */
if (NULL != child &&
(jobdat->stdin_target == ORTE_VPID_WILDCARD ||
child->name.vpid == jobdat->stdin_target)) {
opts.connect_stdin = true;
} else {
opts.connect_stdin = false;
}
if (ORTE_SUCCESS != (rc = orte_iof_base_setup_prefork(&opts))) {
ORTE_ERROR_LOG(rc);
if (NULL != child) {
child->state = ORTE_PROC_STATE_FAILED_TO_START;
child->exit_code = rc;
}
return rc;
}
}
/* A pipe is used to communicate between the parent and child to /* A pipe is used to communicate between the parent and child to
indicate whether the exec ultimately succeeded or failed. The indicate whether the exec ultimately succeeded or failed. The
child sets the pipe to be close-on-exec; the child only ever child sets the pipe to be close-on-exec; the child only ever
@ -668,16 +624,16 @@ static int odls_alps_fork_local_proc(orte_app_context_t* context,
} }
if (pid == 0) { if (pid == 0) {
close(p[0]); close(p[0]);
#if HAVE_SETPGID #if HAVE_SETPGID
setpgid(0, 0); setpgid(0, 0);
#endif #endif
do_child(context, child, environ_copy, jobdat, p[1], opts); do_child(child, app, argv, environ_copy, jobdat, p[1], opts);
/* Does not return */ /* Does not return */
} }
close(p[1]); close(p[1]);
return do_parent(context, child, environ_copy, jobdat, p[0], opts); return do_parent(child, app, argv, environ_copy, jobdat, p[0], opts);
} }