From 6d6bc9bd07f91318e5068d2b6e0870ad17ad9670 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Sun, 12 Mar 2017 08:31:08 -0700 Subject: [PATCH] Update alps module to new APIs Signed-off-by: Ralph Castain --- contrib/platform/intel/bend/gadget | 4 +- contrib/platform/intel/bend/gadget.conf | 2 +- contrib/scaling/scaling.pl | 2 +- orte/mca/odls/alps/odls_alps_module.c | 112 +++++++----------------- 4 files changed, 38 insertions(+), 82 deletions(-) diff --git a/contrib/platform/intel/bend/gadget b/contrib/platform/intel/bend/gadget index eff8652b31..74af16639d 100644 --- a/contrib/platform/intel/bend/gadget +++ b/contrib/platform/intel/bend/gadget @@ -2,11 +2,11 @@ enable_orterun_prefix_by_default=yes enable_mpi_thread_multiple=no enable_mem_debug=no enable_mem_profile=no -enable_debug_symbols=yes +enable_debug_symbols=no enable_binaries=yes enable_heterogeneous=no enable_picky=yes -enable_debug=yes +enable_debug=no enable_shared=yes enable_static=yes enable_memchecker=no diff --git a/contrib/platform/intel/bend/gadget.conf b/contrib/platform/intel/bend/gadget.conf index 5141b6b727..0a2e074503 100644 --- a/contrib/platform/intel/bend/gadget.conf +++ b/contrib/platform/intel/bend/gadget.conf @@ -63,4 +63,4 @@ mca_base_component_show_load_errors = 1 orte_abort_timeout = 10 hwloc_base_mem_bind_failure_action = silent - +btl_ugni_rcache=grdma diff --git a/contrib/scaling/scaling.pl b/contrib/scaling/scaling.pl index 1aff137fad..27f03ef623 100755 --- a/contrib/scaling/scaling.pl +++ b/contrib/scaling/scaling.pl @@ -151,7 +151,7 @@ if ($myresults) { # determine the number of nodes - doesn't # matter which starter we use -$cmd = "mpirun --novm --pernode hostname"; +$cmd = "mpirun --pernode hostname"; $output = `$cmd`; @lines = split(/\n/, $output); $num_nodes = $#lines + 1; diff --git a/orte/mca/odls/alps/odls_alps_module.c b/orte/mca/odls/alps/odls_alps_module.c index c8648642bb..f1eaa29ff2 100644 --- a/orte/mca/odls/alps/odls_alps_module.c +++ b/orte/mca/odls/alps/odls_alps_module.c @@ -15,7 +15,7 @@ * Copyright (c) 2010 IBM Corporation. All rights reserved. * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * * $COPYRIGHT$ * @@ -144,8 +144,8 @@ static int orte_odls_alps_restart_proc(orte_proc_t *child); static void send_error_show_help(int fd, int exit_status, const char *file, const char *topic, ...) __opal_attribute_noreturn__; -static int do_child(orte_app_context_t* context, - orte_proc_t *child, +static int do_child(orte_proc_t *child, + char *app, char **argv, char **environ_copy, orte_job_t *jobdat, int write_fd, orte_iof_base_io_conf_t opts) @@ -342,8 +342,8 @@ static int close_open_file_descriptors(int write_fd, orte_iof_base_io_conf_t opt return ORTE_SUCCESS; } -static int do_child(orte_app_context_t* context, - orte_proc_t *child, +static int do_child( orte_proc_t *child, + char *app, char **argv, char **environ_copy, orte_job_t *jobdat, int write_fd, orte_iof_base_io_conf_t opts) @@ -375,7 +375,7 @@ static int do_child(orte_app_context_t* context, send_error_show_help(write_fd, 1, "help-orte-odls-alps.txt", "iof setup failed", - orte_process_info.nodename, context->app); + orte_process_info.nodename, app); /* Does not return */ } @@ -399,30 +399,18 @@ static int do_child(orte_app_context_t* context, close(fdnull); } - /* if the user requested it, set the system resource limits */ - if (OPAL_SUCCESS != (rc = opal_util_init_sys_limits(&msg))) { - send_error_show_help(write_fd, 1, "help-orte-odls-alps.txt", - "set limit", - orte_process_info.nodename, context->app, - __FILE__, __LINE__, msg); - } - /* ensure we only do this once */ - (void) mca_base_var_env_name("opal_set_max_sys_limits", ¶m); - opal_unsetenv(param, &environ_copy); - free(param); - if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, opts)) { send_error_show_help(write_fd, 1, "help-orte-odls-alps.txt", "close fds", - orte_process_info.nodename, context->app, + orte_process_info.nodename, app, __FILE__, __LINE__); } - if (context->argv == NULL) { - context->argv = malloc(sizeof(char*)*2); - context->argv[0] = strdup(context->app); - context->argv[1] = NULL; + if (argv == NULL) { + argv = malloc(sizeof(char*)*2); + argv[0] = strdup(app); + argv[1] = NULL; } /* Set signal handlers back to the default. Do this close to @@ -449,25 +437,25 @@ static int do_child(orte_app_context_t* context, if (10 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) { int jout; - opal_output(0, "%s STARTING %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), context->app); - for (jout=0; NULL != context->argv[jout]; jout++) { - opal_output(0, "%s\tARGV[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, context->argv[jout]); + opal_output(0, "%s STARTING %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app); + for (jout=0; NULL != argv[jout]; jout++) { + opal_output(0, "%s\tARGV[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, argv[jout]); } for (jout=0; NULL != environ_copy[jout]; jout++) { opal_output(0, "%s\tENVIRON[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, environ_copy[jout]); } } - execve(context->app, context->argv, environ_copy); + execve(app, argv, environ_copy); send_error_show_help(write_fd, 1, "help-orte-odls-alps.txt", "execve error", - orte_process_info.nodename, context->app, strerror(errno)); + orte_process_info.nodename, app, strerror(errno)); /* Does not return */ } -static int do_parent(orte_app_context_t* context, - orte_proc_t *child, +static int do_parent(orte_proc_t *child, + char *app, char **argv, char **environ_copy, orte_job_t *jobdat, int read_fd, orte_iof_base_io_conf_t opts) @@ -476,19 +464,10 @@ static int do_parent(orte_app_context_t* context, orte_odls_pipe_err_msg_t msg; char file[ORTE_ODLS_MAX_FILE_LEN + 1], topic[ORTE_ODLS_MAX_TOPIC_LEN + 1], *str = NULL; - if (NULL != child && ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) { - /* connect endpoints IOF */ - rc = orte_iof_base_setup_parent(&child->name, &opts); - if (ORTE_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - close(read_fd); - - if (NULL != child) { - child->state = ORTE_PROC_STATE_UNDEF; - } - return rc; - } - } + close(opts.p_stdin[0]); + close(opts.p_stdout[1]); + close(opts.p_stderr[1]); + close(opts.p_internal[1]); /* Block reading a message from the pipe */ while (1) { @@ -525,7 +504,7 @@ static int do_parent(orte_app_context_t* context, if (OPAL_SUCCESS != rc) { orte_show_help("help-orte-odls-alps.txt", "syscall fail", true, - orte_process_info.nodename, context->app, + orte_process_info.nodename, app, "opal_fd_read", __FILE__, __LINE__); if (NULL != child) { child->state = ORTE_PROC_STATE_UNDEF; @@ -539,7 +518,7 @@ static int do_parent(orte_app_context_t* context, if (OPAL_SUCCESS != rc) { orte_show_help("help-orte-odls-alps.txt", "syscall fail", true, - orte_process_info.nodename, context->app, + orte_process_info.nodename, app, "opal_fd_read", __FILE__, __LINE__); if (NULL != child) { child->state = ORTE_PROC_STATE_UNDEF; @@ -553,7 +532,7 @@ static int do_parent(orte_app_context_t* context, if (NULL == str) { orte_show_help("help-orte-odls-alps.txt", "syscall fail", true, - orte_process_info.nodename, context->app, + orte_process_info.nodename, app, "opal_fd_read", __FILE__, __LINE__); if (NULL != child) { child->state = ORTE_PROC_STATE_UNDEF; @@ -602,39 +581,16 @@ static int do_parent(orte_app_context_t* context, /** * Fork/exec the specified processes */ -static int odls_alps_fork_local_proc(orte_app_context_t* context, - orte_proc_t *child, - char **environ_copy, - orte_job_t *jobdat) +static int odls_alps_fork_local_proc(orte_proc_t *child, + char *app, + char **argv, + char **environ_copy, + orte_job_t *jobdat, + orte_iof_base_io_conf_t opts) { - orte_iof_base_io_conf_t opts; int rc, p[2]; pid_t pid; - if (NULL != child) { - /* should pull this information from MPIRUN instead of going with - default */ - opts.usepty = OPAL_ENABLE_PTY_SUPPORT; - - /* do we want to setup stdin? */ - if (NULL != child && - (jobdat->stdin_target == ORTE_VPID_WILDCARD || - child->name.vpid == jobdat->stdin_target)) { - opts.connect_stdin = true; - } else { - opts.connect_stdin = false; - } - - if (ORTE_SUCCESS != (rc = orte_iof_base_setup_prefork(&opts))) { - ORTE_ERROR_LOG(rc); - if (NULL != child) { - child->state = ORTE_PROC_STATE_FAILED_TO_START; - child->exit_code = rc; - } - return rc; - } - } - /* A pipe is used to communicate between the parent and child to indicate whether the exec ultimately succeeded or failed. The child sets the pipe to be close-on-exec; the child only ever @@ -668,16 +624,16 @@ static int odls_alps_fork_local_proc(orte_app_context_t* context, } if (pid == 0) { - close(p[0]); + close(p[0]); #if HAVE_SETPGID setpgid(0, 0); #endif - do_child(context, child, environ_copy, jobdat, p[1], opts); + do_child(child, app, argv, environ_copy, jobdat, p[1], opts); /* Does not return */ } close(p[1]); - return do_parent(context, child, environ_copy, jobdat, p[0], opts); + return do_parent(child, app, argv, environ_copy, jobdat, p[0], opts); }