diff --git a/orte/mca/odls/bproc/odls_bproc.c b/orte/mca/odls/bproc/odls_bproc.c index b349d7b1a7..88a5975eea 100644 --- a/orte/mca/odls/bproc/odls_bproc.c +++ b/orte/mca/odls/bproc/odls_bproc.c @@ -511,7 +511,7 @@ int orte_odls_bproc_subscribe_launch_data(orte_jobid_t job, orte_gpr_notify_cb_f * @retval error */ int -orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data) +orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data, char **base_environ) { odls_bproc_child_t *child; opal_list_item_t* item; diff --git a/orte/mca/odls/bproc/odls_bproc.h b/orte/mca/odls/bproc/odls_bproc.h index d311df244c..0a9888894e 100644 --- a/orte/mca/odls/bproc/odls_bproc.h +++ b/orte/mca/odls/bproc/odls_bproc.h @@ -57,7 +57,7 @@ int orte_odls_bproc_finalize(void); * Interface */ int orte_odls_bproc_subscribe_launch_data(orte_jobid_t job, orte_gpr_notify_cb_fn_t cbfunc); -int orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data); +int orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data, char **base_environ); int orte_odls_bproc_kill_local_procs(orte_jobid_t job, bool set_state); int orte_odls_bproc_signal_local_procs(orte_process_name_t* proc_name, int32_t signal); diff --git a/orte/mca/odls/default/odls_default.h b/orte/mca/odls/default/odls_default.h index def6aa9804..61fa2a0061 100644 --- a/orte/mca/odls/default/odls_default.h +++ b/orte/mca/odls/default/odls_default.h @@ -53,7 +53,7 @@ int orte_odls_default_finalize(void); * Interface */ int orte_odls_default_subscribe_launch_data(orte_jobid_t job, orte_gpr_notify_cb_fn_t cbfunc); -int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data); +int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data, char **base_environ); int orte_odls_default_kill_local_procs(orte_jobid_t job, bool set_state); int orte_odls_default_signal_local_procs(const orte_process_name_t *proc, int32_t signal); diff --git a/orte/mca/odls/default/odls_default_module.c 
b/orte/mca/odls/default/odls_default_module.c index 8f2accbe93..45c997a1cb 100644 --- a/orte/mca/odls/default/odls_default_module.c +++ b/orte/mca/odls/default/odls_default_module.c @@ -80,8 +80,6 @@ #include "orte/mca/odls/base/odls_private.h" #include "orte/mca/odls/default/odls_default.h" -extern char **environ; - static void set_handler_default(int sig); orte_odls_base_module_t orte_odls_default_module = { @@ -450,7 +448,8 @@ static int odls_default_fork_local_proc( orte_vpid_t vpid_start, orte_vpid_t vpid_range, bool want_processor, - size_t processor) + size_t processor, + char **base_environ) { pid_t pid; orte_iof_base_io_conf_t opts; @@ -532,9 +531,9 @@ static int odls_default_fork_local_proc( /* setup base environment: copy the current environ and merge in the app context environ */ if (NULL != context->env) { - environ_copy = opal_environ_merge(environ, context->env); + environ_copy = opal_environ_merge(base_environ, context->env); } else { - environ_copy = opal_argv_copy(environ); + environ_copy = opal_argv_copy(base_environ); } /* special case handling for --prefix: this is somewhat icky, @@ -723,7 +722,7 @@ static int odls_default_fork_local_proc( * Launch all processes allocated to the current node. 
*/ -int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data) +int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data, char **base_environ) { int rc; orte_std_cntr_t i, j, kv, kv2, *sptr; @@ -925,7 +924,8 @@ DOFORK: OPAL_THREAD_UNLOCK(&orte_odls_default.mutex); if (ORTE_SUCCESS != (rc = odls_default_fork_local_proc(app, child, start, - range, want_processor, i))) { + range, want_processor, + i, base_environ))) { ORTE_ERROR_LOG(rc); orte_smr.set_proc_state(child->name, ORTE_PROC_STATE_ABORTED, 0); opal_condition_signal(&orte_odls_default.cond); diff --git a/orte/mca/odls/odls.h b/orte/mca/odls/odls.h index 28cdfd310d..6576bed903 100644 --- a/orte/mca/odls/odls.h +++ b/orte/mca/odls/odls.h @@ -48,7 +48,7 @@ typedef int (*orte_odls_base_module_subscribe_launch_data_fn_t)(orte_jobid_t job /** * Locally launch the provided processes */ -typedef int (*orte_odls_base_module_launch_local_processes_fn_t)(orte_gpr_notify_data_t *data); +typedef int (*orte_odls_base_module_launch_local_processes_fn_t)(orte_gpr_notify_data_t *data, char **base_environ); /** * Kill the local processes on this node diff --git a/orte/mca/odls/process/odls_process_module.c b/orte/mca/odls/process/odls_process_module.c index 4b41496084..758ed5c829 100755 --- a/orte/mca/odls/process/odls_process_module.c +++ b/orte/mca/odls/process/odls_process_module.c @@ -418,7 +418,8 @@ static int orte_odls_process_fork_local_proc( orte_vpid_t vpid_start, orte_vpid_t vpid_range, bool want_processor, - size_t processor) + size_t processor, + char **base_environ) { pid_t pid; orte_iof_base_io_conf_t opts; @@ -460,9 +461,9 @@ static int orte_odls_process_fork_local_proc( /* setup base environment: copy the current environ and merge in the app context environ */ if (NULL != context->env) { - environ_copy = opal_environ_merge(environ, context->env); + environ_copy = opal_environ_merge(base_environ, context->env); } else { - environ_copy = opal_argv_copy(environ); + environ_copy = 
opal_argv_copy(base_environ); } /* special case handling for --prefix: this is somewhat icky, @@ -597,7 +598,7 @@ static int orte_odls_process_fork_local_proc( * Launch all processes allocated to the current node. */ -static int orte_odls_process_launch_local_procs(orte_gpr_notify_data_t *data) +static int orte_odls_process_launch_local_procs(orte_gpr_notify_data_t *data, char **base_environ) { int rc; orte_std_cntr_t i, j, kv, kv2, *sptr; @@ -799,7 +800,8 @@ DOFORK: OPAL_THREAD_UNLOCK(&orte_odls_process.mutex); if (ORTE_SUCCESS != (rc = orte_odls_process_fork_local_proc(app, child, start, - range, want_processor, i))) { + range, want_processor, + i, base_environ))) { ORTE_ERROR_LOG(rc); orte_smr.set_proc_state(child->name, ORTE_PROC_STATE_ABORTED, 0); opal_condition_signal(&orte_odls_process.cond); @@ -821,7 +823,7 @@ DOFORK: return ORTE_SUCCESS; } -static int send_signal(pid_t pid, int signal) +static int send_signal(pid_t pid, int signal) { return ORTE_ERROR; } diff --git a/orte/mca/pls/base/pls_base_general_support_fns.c b/orte/mca/pls/base/pls_base_general_support_fns.c index b8b439d008..252f352b8e 100644 --- a/orte/mca/pls/base/pls_base_general_support_fns.c +++ b/orte/mca/pls/base/pls_base_general_support_fns.c @@ -47,6 +47,7 @@ static int lookup_set(char *a, char *b, char *c, int default_val, int orte_pls_base_mca_argv(int *argc, char ***argv) { lookup_set("orted", "spin", NULL, 0, "--spin", argc, argv); + lookup_set("orted", "no_daemonize", NULL, 0, "--no-daemonize", argc, argv); lookup_set("orte", "debug", NULL, 0, "--debug", argc, argv); lookup_set("orte", "debug", "daemons", 0, "--debug-daemons", argc, argv); lookup_set("orte", "debug", "daemons_file", 0, "--debug-daemons-file", argc, argv); diff --git a/orte/mca/pls/rsh/pls_rsh.h b/orte/mca/pls/rsh/pls_rsh.h index d05125ff78..60d46871b4 100644 --- a/orte/mca/pls/rsh/pls_rsh.h +++ b/orte/mca/pls/rsh/pls_rsh.h @@ -61,6 +61,7 @@ int orte_pls_rsh_signal_proc(const orte_process_name_t* proc_name, int32_t); 
struct orte_pls_rsh_component_t { orte_pls_base_component_t super; bool debug; + bool debug_malloc; bool reap; bool assume_same_shell; int delay; diff --git a/orte/mca/pls/rsh/pls_rsh_component.c b/orte/mca/pls/rsh/pls_rsh_component.c index b3af969729..13ab6c15b8 100644 --- a/orte/mca/pls/rsh/pls_rsh_component.c +++ b/orte/mca/pls/rsh/pls_rsh_component.c @@ -109,6 +109,7 @@ orte_pls_rsh_component_t mca_pls_rsh_component = { int orte_pls_rsh_component_open(void) { int tmp; + char *ctmp; mca_base_component_t *c = &mca_pls_rsh_component.super.pls_version; /* initialize globals */ @@ -141,10 +142,27 @@ int orte_pls_rsh_component_open(void) mca_pls_rsh_component.debug = OPAL_INT_TO_BOOL(tmp); } + /* see if we want to use malloc options to debug memory in the daemons */ + mca_base_param_reg_int_name("orte_debug", "malloc", + "Whether or not to use the malloc options to debug memory usage (Mac OS-X *only*)", + false, false, (int)false, &tmp); + mca_pls_rsh_component.debug_malloc = OPAL_INT_TO_BOOL(tmp); + mca_base_param_reg_string(c, "orted", "The command name that the rsh pls component will invoke for the ORTE daemon", false, false, "orted", &mca_pls_rsh_component.orted); + + /* see if we want to use valgrind to debug memory in the daemons */ + mca_base_param_reg_int_name("orte_debug", "valgrind", + "Whether or not to launch the orteds under valgrind (Linux *only*)", + false, false, (int)false, &tmp); + if (tmp) { + asprintf(&ctmp, "valgrind %s", mca_pls_rsh_component.orted); + free(mca_pls_rsh_component.orted); + mca_pls_rsh_component.orted = ctmp; + } + mca_base_param_reg_int(c, "priority", "Priority of the rsh pls component", false, false, 10, diff --git a/orte/mca/pls/rsh/pls_rsh_module.c b/orte/mca/pls/rsh/pls_rsh_module.c index ab4229c6fa..67cdc7fc67 100644 --- a/orte/mca/pls/rsh/pls_rsh_module.c +++ b/orte/mca/pls/rsh/pls_rsh_module.c @@ -910,7 +910,15 @@ int orte_pls_rsh_launch(orte_jobid_t jobid) env = opal_argv_copy(environ); var = 
mca_base_param_environ_variable("seed",NULL,NULL); opal_setenv(var, "0", true, &env); - + + /* check for malloc debug options */ + if (mca_pls_rsh_component.debug_malloc) { + opal_setenv("MallocPreScribble", "1", true, &env); + opal_setenv("MallocScribble", "1", true, &env); + opal_setenv("MallocCheckHeapAbort", "1", true, &env); + opal_setenv("MallocBadFreeAbort", "1", true, &env); + } + /* exec the daemon */ if (mca_pls_rsh_component.debug) { param = opal_argv_join(exec_argv, ' '); diff --git a/orte/tools/orted/orted.c b/orte/tools/orted/orted.c index 62c6455cb7..cff427bfd5 100644 --- a/orte/tools/orted/orted.c +++ b/orte/tools/orted/orted.c @@ -47,6 +47,7 @@ #include "opal/util/printf.h" #include "opal/util/show_help.h" #include "opal/util/trace.h" +#include "opal/util/argv.h" #include "orte/dss/dss.h" #include "orte/class/orte_value_array.h" @@ -198,8 +199,18 @@ int main(int argc, char *argv[]) char *param; int i; - /* setup to check common command line options that just report and die */ + /* initialize the globals */ memset(&orted_globals, 0, sizeof(orted_globals_t)); + + /* save the environment for use when launching application processes */ + orted_globals.saved_environ = opal_argv_copy(environ); + /* clear it from any orted-related directives */ + opal_unsetenv("MallocPreScribble", &orted_globals.saved_environ); + opal_unsetenv("MallocScribble", &orted_globals.saved_environ); + opal_unsetenv("MallocCheckHeapAbort", &orted_globals.saved_environ); + opal_unsetenv("MallocBadFreeAbort", &orted_globals.saved_environ); + + /* setup to check common command line options that just report and die */ cmd_line = OBJ_NEW(opal_cmd_line_t); opal_cmd_line_create(cmd_line, orte_cmd_line_opts); if (ORTE_SUCCESS != (ret = opal_cmd_line_parse(cmd_line, false, @@ -229,8 +240,9 @@ int main(int argc, char *argv[]) if (1000 < i) i=0; } - /* Okay, now on to serious business - * First, ensure the process info structure in instantiated and initialized + /* Okay, now on to serious 
business! */ + + /* Ensure the process info structure is instantiated and initialized * and set the daemon flag to true */ orte_process_info.daemon = true; @@ -565,7 +577,7 @@ static void orted_local_cb_launcher(orte_gpr_notify_data_t *data, void *user_tag /* pass the data to the orted_local_launcher and get a report on * success or failure of the launch */ - if (ORTE_SUCCESS != (rc = orte_odls.launch_local_procs(data))) { + if (ORTE_SUCCESS != (rc = orte_odls.launch_local_procs(data, orted_globals.saved_environ))) { /* if there was an error, report it and wakeup the orted */ ORTE_ERROR_LOG(rc); orted_globals.exit_condition = true; @@ -675,7 +687,7 @@ static void orte_daemon_recv_pls(int status, orte_process_name_t* sender, } /* launch the processes */ - if (ORTE_SUCCESS != (ret = orte_odls.launch_local_procs(ndat))) { + if (ORTE_SUCCESS != (ret = orte_odls.launch_local_procs(ndat, orted_globals.saved_environ))) { ORTE_ERROR_LOG(ret); } diff --git a/orte/tools/orted/orted.h b/orte/tools/orted/orted.h index d77b182431..c2c558cc08 100644 --- a/orte/tools/orted/orted.h +++ b/orte/tools/orted/orted.h @@ -40,6 +40,7 @@ typedef struct { char* vpid_start; char* num_procs; char* universe; + char **saved_environ; int bootproxy; int uri_pipe; opal_mutex_t mutex; diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index 49c5d44b53..3917e9538a 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -236,18 +236,31 @@ opal_cmd_line_init_t cmd_line_init[] = { { "orte", "debug", NULL, 'd', NULL, "debug-devel", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, "Enable debugging of OpenRTE" }, + { "orte", "debug", "daemons", '\0', NULL, "debug-daemons", 0, NULL, OPAL_CMD_LINE_TYPE_INT, "Enable debugging of any OpenRTE daemons used by this application" }, + { "orte", "debug", "daemons_file", '\0', NULL, "debug-daemons-file", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, "Enable debugging of any OpenRTE daemons used by this application, storing output in files" }, + 
+ { "orte", "debug", "malloc", '\0', NULL, "debug-malloc", 0, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Enable debugging of OpenRTE using malloc options (Mac OS-X *only*)" }, + + { "orte", "debug", "valgrind", '\0', NULL, "debug-valgrind", 0, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Enable debugging of OpenRTE using valgrind on daemons (Linux *only*)" }, + { "orte", "no_daemonize", NULL, '\0', NULL, "no-daemonize", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, "Do not detach OpenRTE daemons used by this application" }, + { "universe", NULL, NULL, '\0', NULL, "universe", 1, NULL, OPAL_CMD_LINE_TYPE_STRING, "Set the universe name as username@hostname:universe_name for this application" }, + { NULL, NULL, NULL, '\0', NULL, "tmpdir", 1, &orte_process_info.tmpdir_base, OPAL_CMD_LINE_TYPE_STRING, "Set the root for the session directory tree for orterun ONLY" }, @@ -361,6 +374,32 @@ int orterun(int argc, char *argv[]) } free(tmp); } + id = mca_base_param_reg_int_name("orte_debug", "malloc", + "Whether or not to use the malloc options to debug memory usage (Mac OS-X *only*)", + false, false, (int)false, &iparam); + if (iparam) { + char *tmp = mca_base_param_environ_variable("orte", "debug", "malloc"); + if (ORTE_SUCCESS != (rc = opal_setenv(tmp, "1", true, &environ))) { + opal_show_help("help-orterun.txt", "orterun:environ", false, + orterun_basename, tmp, "1", rc); + free(tmp); + return rc; + } + free(tmp); + } + id = mca_base_param_reg_int_name("orte_debug", "valgrind", + "Whether or not to launch the orteds under valgrind (Linux *only*)", + false, false, (int)false, &iparam); + if (iparam) { + char *tmp = mca_base_param_environ_variable("orte", "debug", "valgrind"); + if (ORTE_SUCCESS != (rc = opal_setenv(tmp, "1", true, &environ))) { + opal_show_help("help-orterun.txt", "orterun:environ", false, + orterun_basename, tmp, "1", rc); + free(tmp); + return rc; + } + free(tmp); + } id = mca_base_param_reg_int_name("orte", "debug", "Top-level ORTE debug switch", false, false, 0, &iparam);