Merge pull request #3146 from rhc54/topic/alps
Update alps module to new APIs
Этот коммит содержится в:
Коммит
59bcad5f8e
@ -2,11 +2,11 @@ enable_orterun_prefix_by_default=yes
|
|||||||
enable_mpi_thread_multiple=no
|
enable_mpi_thread_multiple=no
|
||||||
enable_mem_debug=no
|
enable_mem_debug=no
|
||||||
enable_mem_profile=no
|
enable_mem_profile=no
|
||||||
enable_debug_symbols=yes
|
enable_debug_symbols=no
|
||||||
enable_binaries=yes
|
enable_binaries=yes
|
||||||
enable_heterogeneous=no
|
enable_heterogeneous=no
|
||||||
enable_picky=yes
|
enable_picky=yes
|
||||||
enable_debug=yes
|
enable_debug=no
|
||||||
enable_shared=yes
|
enable_shared=yes
|
||||||
enable_static=yes
|
enable_static=yes
|
||||||
enable_memchecker=no
|
enable_memchecker=no
|
||||||
|
@ -63,4 +63,4 @@
|
|||||||
mca_base_component_show_load_errors = 1
|
mca_base_component_show_load_errors = 1
|
||||||
orte_abort_timeout = 10
|
orte_abort_timeout = 10
|
||||||
hwloc_base_mem_bind_failure_action = silent
|
hwloc_base_mem_bind_failure_action = silent
|
||||||
|
btl_ugni_rcache=grdma
|
||||||
|
@ -151,7 +151,7 @@ if ($myresults) {
|
|||||||
|
|
||||||
# determine the number of nodes - doesn't
|
# determine the number of nodes - doesn't
|
||||||
# matter which starter we use
|
# matter which starter we use
|
||||||
$cmd = "mpirun --novm --pernode hostname";
|
$cmd = "mpirun --pernode hostname";
|
||||||
$output = `$cmd`;
|
$output = `$cmd`;
|
||||||
@lines = split(/\n/, $output);
|
@lines = split(/\n/, $output);
|
||||||
$num_nodes = $#lines + 1;
|
$num_nodes = $#lines + 1;
|
||||||
|
@ -15,7 +15,7 @@
|
|||||||
* Copyright (c) 2010 IBM Corporation. All rights reserved.
|
* Copyright (c) 2010 IBM Corporation. All rights reserved.
|
||||||
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
|
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
||||||
*
|
*
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
@ -144,8 +144,8 @@ static int orte_odls_alps_restart_proc(orte_proc_t *child);
|
|||||||
static void send_error_show_help(int fd, int exit_status,
|
static void send_error_show_help(int fd, int exit_status,
|
||||||
const char *file, const char *topic, ...)
|
const char *file, const char *topic, ...)
|
||||||
__opal_attribute_noreturn__;
|
__opal_attribute_noreturn__;
|
||||||
static int do_child(orte_app_context_t* context,
|
static int do_child(orte_proc_t *child,
|
||||||
orte_proc_t *child,
|
char *app, char **argv,
|
||||||
char **environ_copy,
|
char **environ_copy,
|
||||||
orte_job_t *jobdat, int write_fd,
|
orte_job_t *jobdat, int write_fd,
|
||||||
orte_iof_base_io_conf_t opts)
|
orte_iof_base_io_conf_t opts)
|
||||||
@ -342,8 +342,8 @@ static int close_open_file_descriptors(int write_fd, orte_iof_base_io_conf_t opt
|
|||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int do_child(orte_app_context_t* context,
|
static int do_child( orte_proc_t *child,
|
||||||
orte_proc_t *child,
|
char *app, char **argv,
|
||||||
char **environ_copy,
|
char **environ_copy,
|
||||||
orte_job_t *jobdat, int write_fd,
|
orte_job_t *jobdat, int write_fd,
|
||||||
orte_iof_base_io_conf_t opts)
|
orte_iof_base_io_conf_t opts)
|
||||||
@ -375,7 +375,7 @@ static int do_child(orte_app_context_t* context,
|
|||||||
send_error_show_help(write_fd, 1,
|
send_error_show_help(write_fd, 1,
|
||||||
"help-orte-odls-alps.txt",
|
"help-orte-odls-alps.txt",
|
||||||
"iof setup failed",
|
"iof setup failed",
|
||||||
orte_process_info.nodename, context->app);
|
orte_process_info.nodename, app);
|
||||||
/* Does not return */
|
/* Does not return */
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -399,30 +399,18 @@ static int do_child(orte_app_context_t* context,
|
|||||||
close(fdnull);
|
close(fdnull);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* if the user requested it, set the system resource limits */
|
|
||||||
if (OPAL_SUCCESS != (rc = opal_util_init_sys_limits(&msg))) {
|
|
||||||
send_error_show_help(write_fd, 1, "help-orte-odls-alps.txt",
|
|
||||||
"set limit",
|
|
||||||
orte_process_info.nodename, context->app,
|
|
||||||
__FILE__, __LINE__, msg);
|
|
||||||
}
|
|
||||||
/* ensure we only do this once */
|
|
||||||
(void) mca_base_var_env_name("opal_set_max_sys_limits", &param);
|
|
||||||
opal_unsetenv(param, &environ_copy);
|
|
||||||
free(param);
|
|
||||||
|
|
||||||
if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, opts)) {
|
if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, opts)) {
|
||||||
send_error_show_help(write_fd, 1, "help-orte-odls-alps.txt",
|
send_error_show_help(write_fd, 1, "help-orte-odls-alps.txt",
|
||||||
"close fds",
|
"close fds",
|
||||||
orte_process_info.nodename, context->app,
|
orte_process_info.nodename, app,
|
||||||
__FILE__, __LINE__);
|
__FILE__, __LINE__);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (context->argv == NULL) {
|
if (argv == NULL) {
|
||||||
context->argv = malloc(sizeof(char*)*2);
|
argv = malloc(sizeof(char*)*2);
|
||||||
context->argv[0] = strdup(context->app);
|
argv[0] = strdup(app);
|
||||||
context->argv[1] = NULL;
|
argv[1] = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Set signal handlers back to the default. Do this close to
|
/* Set signal handlers back to the default. Do this close to
|
||||||
@ -449,25 +437,25 @@ static int do_child(orte_app_context_t* context,
|
|||||||
|
|
||||||
if (10 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
|
if (10 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
|
||||||
int jout;
|
int jout;
|
||||||
opal_output(0, "%s STARTING %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), context->app);
|
opal_output(0, "%s STARTING %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app);
|
||||||
for (jout=0; NULL != context->argv[jout]; jout++) {
|
for (jout=0; NULL != argv[jout]; jout++) {
|
||||||
opal_output(0, "%s\tARGV[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, context->argv[jout]);
|
opal_output(0, "%s\tARGV[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, argv[jout]);
|
||||||
}
|
}
|
||||||
for (jout=0; NULL != environ_copy[jout]; jout++) {
|
for (jout=0; NULL != environ_copy[jout]; jout++) {
|
||||||
opal_output(0, "%s\tENVIRON[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, environ_copy[jout]);
|
opal_output(0, "%s\tENVIRON[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, environ_copy[jout]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
execve(context->app, context->argv, environ_copy);
|
execve(app, argv, environ_copy);
|
||||||
send_error_show_help(write_fd, 1,
|
send_error_show_help(write_fd, 1,
|
||||||
"help-orte-odls-alps.txt", "execve error",
|
"help-orte-odls-alps.txt", "execve error",
|
||||||
orte_process_info.nodename, context->app, strerror(errno));
|
orte_process_info.nodename, app, strerror(errno));
|
||||||
/* Does not return */
|
/* Does not return */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static int do_parent(orte_app_context_t* context,
|
static int do_parent(orte_proc_t *child,
|
||||||
orte_proc_t *child,
|
char *app, char **argv,
|
||||||
char **environ_copy,
|
char **environ_copy,
|
||||||
orte_job_t *jobdat, int read_fd,
|
orte_job_t *jobdat, int read_fd,
|
||||||
orte_iof_base_io_conf_t opts)
|
orte_iof_base_io_conf_t opts)
|
||||||
@ -476,19 +464,10 @@ static int do_parent(orte_app_context_t* context,
|
|||||||
orte_odls_pipe_err_msg_t msg;
|
orte_odls_pipe_err_msg_t msg;
|
||||||
char file[ORTE_ODLS_MAX_FILE_LEN + 1], topic[ORTE_ODLS_MAX_TOPIC_LEN + 1], *str = NULL;
|
char file[ORTE_ODLS_MAX_FILE_LEN + 1], topic[ORTE_ODLS_MAX_TOPIC_LEN + 1], *str = NULL;
|
||||||
|
|
||||||
if (NULL != child && ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
|
close(opts.p_stdin[0]);
|
||||||
/* connect endpoints IOF */
|
close(opts.p_stdout[1]);
|
||||||
rc = orte_iof_base_setup_parent(&child->name, &opts);
|
close(opts.p_stderr[1]);
|
||||||
if (ORTE_SUCCESS != rc) {
|
close(opts.p_internal[1]);
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
close(read_fd);
|
|
||||||
|
|
||||||
if (NULL != child) {
|
|
||||||
child->state = ORTE_PROC_STATE_UNDEF;
|
|
||||||
}
|
|
||||||
return rc;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Block reading a message from the pipe */
|
/* Block reading a message from the pipe */
|
||||||
while (1) {
|
while (1) {
|
||||||
@ -525,7 +504,7 @@ static int do_parent(orte_app_context_t* context,
|
|||||||
if (OPAL_SUCCESS != rc) {
|
if (OPAL_SUCCESS != rc) {
|
||||||
orte_show_help("help-orte-odls-alps.txt", "syscall fail",
|
orte_show_help("help-orte-odls-alps.txt", "syscall fail",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename, context->app,
|
orte_process_info.nodename, app,
|
||||||
"opal_fd_read", __FILE__, __LINE__);
|
"opal_fd_read", __FILE__, __LINE__);
|
||||||
if (NULL != child) {
|
if (NULL != child) {
|
||||||
child->state = ORTE_PROC_STATE_UNDEF;
|
child->state = ORTE_PROC_STATE_UNDEF;
|
||||||
@ -539,7 +518,7 @@ static int do_parent(orte_app_context_t* context,
|
|||||||
if (OPAL_SUCCESS != rc) {
|
if (OPAL_SUCCESS != rc) {
|
||||||
orte_show_help("help-orte-odls-alps.txt", "syscall fail",
|
orte_show_help("help-orte-odls-alps.txt", "syscall fail",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename, context->app,
|
orte_process_info.nodename, app,
|
||||||
"opal_fd_read", __FILE__, __LINE__);
|
"opal_fd_read", __FILE__, __LINE__);
|
||||||
if (NULL != child) {
|
if (NULL != child) {
|
||||||
child->state = ORTE_PROC_STATE_UNDEF;
|
child->state = ORTE_PROC_STATE_UNDEF;
|
||||||
@ -553,7 +532,7 @@ static int do_parent(orte_app_context_t* context,
|
|||||||
if (NULL == str) {
|
if (NULL == str) {
|
||||||
orte_show_help("help-orte-odls-alps.txt", "syscall fail",
|
orte_show_help("help-orte-odls-alps.txt", "syscall fail",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename, context->app,
|
orte_process_info.nodename, app,
|
||||||
"opal_fd_read", __FILE__, __LINE__);
|
"opal_fd_read", __FILE__, __LINE__);
|
||||||
if (NULL != child) {
|
if (NULL != child) {
|
||||||
child->state = ORTE_PROC_STATE_UNDEF;
|
child->state = ORTE_PROC_STATE_UNDEF;
|
||||||
@ -602,39 +581,16 @@ static int do_parent(orte_app_context_t* context,
|
|||||||
/**
|
/**
|
||||||
* Fork/exec the specified processes
|
* Fork/exec the specified processes
|
||||||
*/
|
*/
|
||||||
static int odls_alps_fork_local_proc(orte_app_context_t* context,
|
static int odls_alps_fork_local_proc(orte_proc_t *child,
|
||||||
orte_proc_t *child,
|
char *app,
|
||||||
char **environ_copy,
|
char **argv,
|
||||||
orte_job_t *jobdat)
|
char **environ_copy,
|
||||||
|
orte_job_t *jobdat,
|
||||||
|
orte_iof_base_io_conf_t opts)
|
||||||
{
|
{
|
||||||
orte_iof_base_io_conf_t opts;
|
|
||||||
int rc, p[2];
|
int rc, p[2];
|
||||||
pid_t pid;
|
pid_t pid;
|
||||||
|
|
||||||
if (NULL != child) {
|
|
||||||
/* should pull this information from MPIRUN instead of going with
|
|
||||||
default */
|
|
||||||
opts.usepty = OPAL_ENABLE_PTY_SUPPORT;
|
|
||||||
|
|
||||||
/* do we want to setup stdin? */
|
|
||||||
if (NULL != child &&
|
|
||||||
(jobdat->stdin_target == ORTE_VPID_WILDCARD ||
|
|
||||||
child->name.vpid == jobdat->stdin_target)) {
|
|
||||||
opts.connect_stdin = true;
|
|
||||||
} else {
|
|
||||||
opts.connect_stdin = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ORTE_SUCCESS != (rc = orte_iof_base_setup_prefork(&opts))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
if (NULL != child) {
|
|
||||||
child->state = ORTE_PROC_STATE_FAILED_TO_START;
|
|
||||||
child->exit_code = rc;
|
|
||||||
}
|
|
||||||
return rc;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* A pipe is used to communicate between the parent and child to
|
/* A pipe is used to communicate between the parent and child to
|
||||||
indicate whether the exec ultimately succeeded or failed. The
|
indicate whether the exec ultimately succeeded or failed. The
|
||||||
child sets the pipe to be close-on-exec; the child only ever
|
child sets the pipe to be close-on-exec; the child only ever
|
||||||
@ -668,16 +624,16 @@ static int odls_alps_fork_local_proc(orte_app_context_t* context,
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (pid == 0) {
|
if (pid == 0) {
|
||||||
close(p[0]);
|
close(p[0]);
|
||||||
#if HAVE_SETPGID
|
#if HAVE_SETPGID
|
||||||
setpgid(0, 0);
|
setpgid(0, 0);
|
||||||
#endif
|
#endif
|
||||||
do_child(context, child, environ_copy, jobdat, p[1], opts);
|
do_child(child, app, argv, environ_copy, jobdat, p[1], opts);
|
||||||
/* Does not return */
|
/* Does not return */
|
||||||
}
|
}
|
||||||
|
|
||||||
close(p[1]);
|
close(p[1]);
|
||||||
return do_parent(context, child, environ_copy, jobdat, p[0], opts);
|
return do_parent(child, app, argv, environ_copy, jobdat, p[0], opts);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user