1
1

Add two options to orterun (--debug-malloc and --debug-valgrind) that support debugging memory corruption in the daemons.

Ensure that the environment provided to local application processes isn't "polluted" by the malloc-debugging variables set for the orteds.

This commit was SVN r12087.
Этот коммит содержится в:
Ralph Castain 2006-10-11 15:18:57 +00:00
родитель 5086207c3c
Коммит 27e305347c
13 изменённых файлов: 105 добавлений и 23 удалений

Просмотреть файл

@ -511,7 +511,7 @@ int orte_odls_bproc_subscribe_launch_data(orte_jobid_t job, orte_gpr_notify_cb_f
* @retval error
*/
int
orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data)
orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data, char **base_environ)
{
odls_bproc_child_t *child;
opal_list_item_t* item;

Просмотреть файл

@ -57,7 +57,7 @@ int orte_odls_bproc_finalize(void);
* Interface
*/
int orte_odls_bproc_subscribe_launch_data(orte_jobid_t job, orte_gpr_notify_cb_fn_t cbfunc);
int orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data);
int orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data, char **base_environ);
int orte_odls_bproc_kill_local_procs(orte_jobid_t job, bool set_state);
int orte_odls_bproc_signal_local_procs(orte_process_name_t* proc_name, int32_t signal);

Просмотреть файл

@ -53,7 +53,7 @@ int orte_odls_default_finalize(void);
* Interface
*/
int orte_odls_default_subscribe_launch_data(orte_jobid_t job, orte_gpr_notify_cb_fn_t cbfunc);
int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data);
int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data, char **base_environ);
int orte_odls_default_kill_local_procs(orte_jobid_t job, bool set_state);
int orte_odls_default_signal_local_procs(const orte_process_name_t *proc,
int32_t signal);

Просмотреть файл

@ -80,8 +80,6 @@
#include "orte/mca/odls/base/odls_private.h"
#include "orte/mca/odls/default/odls_default.h"
extern char **environ;
static void set_handler_default(int sig);
orte_odls_base_module_t orte_odls_default_module = {
@ -450,7 +448,8 @@ static int odls_default_fork_local_proc(
orte_vpid_t vpid_start,
orte_vpid_t vpid_range,
bool want_processor,
size_t processor)
size_t processor,
char **base_environ)
{
pid_t pid;
orte_iof_base_io_conf_t opts;
@ -532,9 +531,9 @@ static int odls_default_fork_local_proc(
/* setup base environment: copy the current environ and merge
in the app context environ */
if (NULL != context->env) {
environ_copy = opal_environ_merge(environ, context->env);
environ_copy = opal_environ_merge(base_environ, context->env);
} else {
environ_copy = opal_argv_copy(environ);
environ_copy = opal_argv_copy(base_environ);
}
/* special case handling for --prefix: this is somewhat icky,
@ -723,7 +722,7 @@ static int odls_default_fork_local_proc(
* Launch all processes allocated to the current node.
*/
int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data)
int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data, char **base_environ)
{
int rc;
orte_std_cntr_t i, j, kv, kv2, *sptr;
@ -925,7 +924,8 @@ DOFORK:
OPAL_THREAD_UNLOCK(&orte_odls_default.mutex);
if (ORTE_SUCCESS != (rc = odls_default_fork_local_proc(app, child, start,
range, want_processor, i))) {
range, want_processor,
i, base_environ))) {
ORTE_ERROR_LOG(rc);
orte_smr.set_proc_state(child->name, ORTE_PROC_STATE_ABORTED, 0);
opal_condition_signal(&orte_odls_default.cond);

Просмотреть файл

@ -48,7 +48,7 @@ typedef int (*orte_odls_base_module_subscribe_launch_data_fn_t)(orte_jobid_t job
/**
* Locally launch the provided processes
*/
typedef int (*orte_odls_base_module_launch_local_processes_fn_t)(orte_gpr_notify_data_t *data);
typedef int (*orte_odls_base_module_launch_local_processes_fn_t)(orte_gpr_notify_data_t *data, char **base_environ);
/**
* Kill the local processes on this node

Просмотреть файл

@ -418,7 +418,8 @@ static int orte_odls_process_fork_local_proc(
orte_vpid_t vpid_start,
orte_vpid_t vpid_range,
bool want_processor,
size_t processor)
size_t processor,
char **base_environ)
{
pid_t pid;
orte_iof_base_io_conf_t opts;
@ -460,9 +461,9 @@ static int orte_odls_process_fork_local_proc(
/* setup base environment: copy the current environ and merge
in the app context environ */
if (NULL != context->env) {
environ_copy = opal_environ_merge(environ, context->env);
environ_copy = opal_environ_merge(base_environ, context->env);
} else {
environ_copy = opal_argv_copy(environ);
environ_copy = opal_argv_copy(base_environ);
}
/* special case handling for --prefix: this is somewhat icky,
@ -597,7 +598,7 @@ static int orte_odls_process_fork_local_proc(
* Launch all processes allocated to the current node.
*/
static int orte_odls_process_launch_local_procs(orte_gpr_notify_data_t *data)
static int orte_odls_process_launch_local_procs(orte_gpr_notify_data_t *data, char **base_environ)
{
int rc;
orte_std_cntr_t i, j, kv, kv2, *sptr;
@ -799,7 +800,8 @@ DOFORK:
OPAL_THREAD_UNLOCK(&orte_odls_process.mutex);
if (ORTE_SUCCESS != (rc = orte_odls_process_fork_local_proc(app, child, start,
range, want_processor, i))) {
range, want_processor,
i, base_environ))) {
ORTE_ERROR_LOG(rc);
orte_smr.set_proc_state(child->name, ORTE_PROC_STATE_ABORTED, 0);
opal_condition_signal(&orte_odls_process.cond);
@ -821,7 +823,7 @@ DOFORK:
return ORTE_SUCCESS;
}
static int send_signal(pid_t pid, int signal)
static int send_signal(pid_t pid, int signal)
{
return ORTE_ERROR;
}

Просмотреть файл

@ -47,6 +47,7 @@ static int lookup_set(char *a, char *b, char *c, int default_val,
int orte_pls_base_mca_argv(int *argc, char ***argv)
{
lookup_set("orted", "spin", NULL, 0, "--spin", argc, argv);
lookup_set("orted", "no_daemonize", NULL, 0, "--no-daemonize", argc, argv);
lookup_set("orte", "debug", NULL, 0, "--debug", argc, argv);
lookup_set("orte", "debug", "daemons", 0, "--debug-daemons", argc, argv);
lookup_set("orte", "debug", "daemons_file", 0, "--debug-daemons-file", argc, argv);

Просмотреть файл

@ -61,6 +61,7 @@ int orte_pls_rsh_signal_proc(const orte_process_name_t* proc_name, int32_t);
struct orte_pls_rsh_component_t {
orte_pls_base_component_t super;
bool debug;
bool debug_malloc;
bool reap;
bool assume_same_shell;
int delay;

Просмотреть файл

@ -109,6 +109,7 @@ orte_pls_rsh_component_t mca_pls_rsh_component = {
int orte_pls_rsh_component_open(void)
{
int tmp;
char *ctmp;
mca_base_component_t *c = &mca_pls_rsh_component.super.pls_version;
/* initialize globals */
@ -141,10 +142,27 @@ int orte_pls_rsh_component_open(void)
mca_pls_rsh_component.debug = OPAL_INT_TO_BOOL(tmp);
}
/* see if we want to use malloc options to debug memory in the daemons */
mca_base_param_reg_int_name("orte_debug", "malloc",
"Whether or not to use the malloc options to debug memory usage (Mac OS-X *only*)",
false, false, (int)false, &tmp);
mca_pls_rsh_component.debug_malloc = OPAL_INT_TO_BOOL(tmp);
mca_base_param_reg_string(c, "orted",
"The command name that the rsh pls component will invoke for the ORTE daemon",
false, false, "orted",
&mca_pls_rsh_component.orted);
/* see if we want to use valgrind to debug memory in the daemons */
mca_base_param_reg_int_name("orte_debug", "valgrind",
"Whether or not to launch the orteds under valgrind (Linux *only*)",
false, false, (int)false, &tmp);
if (tmp) {
asprintf(&ctmp, "valgrind %s", mca_pls_rsh_component.orted);
free(mca_pls_rsh_component.orted);
mca_pls_rsh_component.orted = ctmp;
}
mca_base_param_reg_int(c, "priority",
"Priority of the rsh pls component",
false, false, 10,

Просмотреть файл

@ -910,7 +910,15 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
env = opal_argv_copy(environ);
var = mca_base_param_environ_variable("seed",NULL,NULL);
opal_setenv(var, "0", true, &env);
/* check for malloc debug options */
if (mca_pls_rsh_component.debug_malloc) {
opal_setenv("MallocPreScribble", "1", true, &env);
opal_setenv("MallocScribble", "1", true, &env);
opal_setenv("MallocCheckHeapAbort", "1", true, &env);
opal_setenv("MallocBadFreeAbort", "1", true, &env);
}
/* exec the daemon */
if (mca_pls_rsh_component.debug) {
param = opal_argv_join(exec_argv, ' ');

Просмотреть файл

@ -47,6 +47,7 @@
#include "opal/util/printf.h"
#include "opal/util/show_help.h"
#include "opal/util/trace.h"
#include "opal/util/argv.h"
#include "orte/dss/dss.h"
#include "orte/class/orte_value_array.h"
@ -198,8 +199,18 @@ int main(int argc, char *argv[])
char *param;
int i;
/* setup to check common command line options that just report and die */
/* initialize the globals */
memset(&orted_globals, 0, sizeof(orted_globals_t));
/* save the environment for use when launching application processes */
orted_globals.saved_environ = opal_argv_copy(environ);
/* clear it from any orted-related directives */
opal_unsetenv("MallocPreScribble", &orted_globals.saved_environ);
opal_unsetenv("MallocScribble", &orted_globals.saved_environ);
opal_unsetenv("MallocCheckHeapAbort", &orted_globals.saved_environ);
opal_unsetenv("MallocBadFreeAbort", &orted_globals.saved_environ);
/* setup to check common command line options that just report and die */
cmd_line = OBJ_NEW(opal_cmd_line_t);
opal_cmd_line_create(cmd_line, orte_cmd_line_opts);
if (ORTE_SUCCESS != (ret = opal_cmd_line_parse(cmd_line, false,
@ -229,8 +240,9 @@ int main(int argc, char *argv[])
if (1000 < i) i=0;
}
/* Okay, now on to serious business
* First, ensure the process info structure in instantiated and initialized
/* Okay, now on to serious business! */
/* Ensure the process info structure is instantiated and initialized
* and set the daemon flag to true
*/
orte_process_info.daemon = true;
@ -565,7 +577,7 @@ static void orted_local_cb_launcher(orte_gpr_notify_data_t *data, void *user_tag
/* pass the data to the orted_local_launcher and get a report on
* success or failure of the launch
*/
if (ORTE_SUCCESS != (rc = orte_odls.launch_local_procs(data))) {
if (ORTE_SUCCESS != (rc = orte_odls.launch_local_procs(data, orted_globals.saved_environ))) {
/* if there was an error, report it and wakeup the orted */
ORTE_ERROR_LOG(rc);
orted_globals.exit_condition = true;
@ -675,7 +687,7 @@ static void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
}
/* launch the processes */
if (ORTE_SUCCESS != (ret = orte_odls.launch_local_procs(ndat))) {
if (ORTE_SUCCESS != (ret = orte_odls.launch_local_procs(ndat, orted_globals.saved_environ))) {
ORTE_ERROR_LOG(ret);
}

Просмотреть файл

@ -40,6 +40,7 @@ typedef struct {
char* vpid_start;
char* num_procs;
char* universe;
char **saved_environ;
int bootproxy;
int uri_pipe;
opal_mutex_t mutex;

Просмотреть файл

@ -236,18 +236,31 @@ opal_cmd_line_init_t cmd_line_init[] = {
{ "orte", "debug", NULL, 'd', NULL, "debug-devel", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Enable debugging of OpenRTE" },
{ "orte", "debug", "daemons", '\0', NULL, "debug-daemons", 0,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Enable debugging of any OpenRTE daemons used by this application" },
{ "orte", "debug", "daemons_file", '\0', NULL, "debug-daemons-file", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Enable debugging of any OpenRTE daemons used by this application, storing output in files" },
{ "orte", "debug", "malloc", '\0', NULL, "debug-malloc", 0,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Enable debugging of OpenRTE using malloc options (Mac OS-X *only*)" },
{ "orte", "debug", "valgrind", '\0', NULL, "debug-valgrind", 0,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Enable debugging of OpenRTE using valgrind on daemons (Linux *only*)" },
{ "orte", "no_daemonize", NULL, '\0', NULL, "no-daemonize", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Do not detach OpenRTE daemons used by this application" },
{ "universe", NULL, NULL, '\0', NULL, "universe", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Set the universe name as username@hostname:universe_name for this application" },
{ NULL, NULL, NULL, '\0', NULL, "tmpdir", 1,
&orte_process_info.tmpdir_base, OPAL_CMD_LINE_TYPE_STRING,
"Set the root for the session directory tree for orterun ONLY" },
@ -361,6 +374,32 @@ int orterun(int argc, char *argv[])
}
free(tmp);
}
id = mca_base_param_reg_int_name("orte_debug", "malloc",
"Whether or not to use the malloc options to debug memory usage (Mac OS-X *only*)",
false, false, (int)false, &iparam);
if (iparam) {
char *tmp = mca_base_param_environ_variable("orte", "debug", "malloc");
if (ORTE_SUCCESS != (rc = opal_setenv(tmp, "1", true, &environ))) {
opal_show_help("help-orterun.txt", "orterun:environ", false,
orterun_basename, tmp, "1", rc);
free(tmp);
return rc;
}
free(tmp);
}
id = mca_base_param_reg_int_name("orte_debug", "valgrind",
"Whether or not to launch the orteds under valgrind (Linux *only*)",
false, false, (int)false, &iparam);
if (iparam) {
char *tmp = mca_base_param_environ_variable("orte", "debug", "valgrind");
if (ORTE_SUCCESS != (rc = opal_setenv(tmp, "1", true, &environ))) {
opal_show_help("help-orterun.txt", "orterun:environ", false,
orterun_basename, tmp, "1", rc);
free(tmp);
return rc;
}
free(tmp);
}
id = mca_base_param_reg_int_name("orte", "debug",
"Top-level ORTE debug switch",
false, false, 0, &iparam);