1
1

Update the debugger interface per the email thread with Jeff and Brian. Hand off to them for final testing and validation.

This commit was SVN r18670.
Этот коммит содержится в:
Ralph Castain 2008-06-18 15:28:46 +00:00
родитель 558e68088c
Коммит 282a220e7e
9 изменённых файлов: 202 добавлений и 170 удалений

Просмотреть файл

@ -65,6 +65,9 @@
#include "ompi/datatype/datatype.h"
#include "ompi/include/mpi.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#if defined(OMPI_MSGQ_DLL)
/* This variable is old/deprecated -- the mpimsgq_dll_locations[]
method is preferred because it's more flexible */
@ -107,6 +110,8 @@ OMPI_DECLSPEC ompi_datatype_t* ompi_datatype_t_type_inclusion = NULL;
OMPI_DECLSPEC volatile int MPIR_debug_gate=0;
/* we don't believe we need MPIR_being_debugged here */
/* Check for a file in a few direct ways for portability */
static void check(char *dir, char *file, char **locations)
{
@ -144,30 +149,33 @@ static void check(char *dir, char *file, char **locations)
*/
void ompi_wait_for_debugger(void)
{
int i, wait_for_debugger, wait_for_tv;
int i, debugger, rc;
char *a, *b, **dirs;
opal_buffer_t buf;
/* Do we need to wait for a TotalView-like debugger? */
/* are we being debugged by a TotalView-like debugger? */
mca_base_param_reg_int_name("ompi",
"mpi_wait_for_debugger",
"mpi_being_debugged",
"Whether the MPI application "
"should wait for a debugger or not",
"is being debugged (default: false)",
false, false, (int) false,
&wait_for_debugger);
mca_base_param_reg_int_name("ompi",
"mpi_wait_for_totalview",
"Deprecated synonym for mpi_wait_for_debugger",
false, false, (int) false,
&wait_for_tv);
wait_for_debugger |= wait_for_tv;
&debugger);
if (!debugger) {
/* if not, just return */
return;
}
/* if we are being debugged, then we need to find
* the correct plug-in
*/
a = strdup(opal_install_dirs.pkglibdir);
mca_base_param_reg_string_name("ompi",
"debugger_dll_path",
"List of directories where MPI_INIT should search for debugger plugins",
false, false, a, &b);
free(a);
/* Search the directory for MPI debugger DLLs */
if (NULL != b) {
dirs = opal_argv_split(b, ':');
@ -176,23 +184,53 @@ void ompi_wait_for_debugger(void)
check(dirs[i], OMPI_MSGQ_DLL_PREFIX, mpimsgq_dll_locations);
}
}
/* If we're waiting for the debugger, then, well, wait for it. :-) */
if (wait_for_debugger) {
/* RHC: the following is a temporary hack until we figure
* out how to resolve the problem of where to
* instance the MPIR* variables so that multiple
* launchers can access them
/* only the rank=0 proc waits for the debugger - everyone else will just
* spin in the barrier in mpi_init until rank=0 joins them
*/
if (0 != ORTE_PROC_MY_NAME->vpid) {
return;
}
/* we have to support at least two ways of completing the
* debug attachment - either we will get a message from
* the HNP telling us it is okay to release, or the debugger
* itself will reach into us and set a gate.
*
* First, attempt to get a message-based release
*/
OBJ_CONSTRUCT(&buf, opal_buffer_t);
rc = orte_rml.recv_buffer(ORTE_NAME_WILDCARD, &buf, ORTE_RML_TAG_DEBUGGER_RELEASE, 0);
OBJ_DESTRUCT(&buf); /* don't care about contents of message */
if (rc > 0) {
/* message received - we can go! */
return;
} else if (ORTE_ERR_NOT_SUPPORTED == rc) {
/* if the recv isn't supported, then we fall back
* to the alternative method for waiting
*/
while (MPIR_debug_gate == 0) {
goto spin_wait;
} else {
/* if it failed for some other reason, then we are
* in trouble - for now, just report the problem
* and give up waiting
*/
opal_output(0, "Debugger_attach[rank=%ld]: could not wait for debugger - error %s!",
(long)ORTE_PROC_MY_NAME->vpid, ORTE_ERROR_NAME(rc));
return;
}
spin_wait:
/* spin until debugger attaches and releases us */
while (MPIR_debug_gate == 0) {
#if defined(__WINDOWS__)
Sleep(100); /* milliseconds */
Sleep(100); /* milliseconds */
#elif defined(HAVE_USLEEP)
usleep(100000); /* microseconds */
usleep(100000); /* microseconds */
#else
sleep(1); /* seconds */
sleep(1); /* seconds */
#endif
}
}
}

Просмотреть файл

@ -592,12 +592,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
orte_process_info.nodename);
}
/* wait for everyone to reach this point */
if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier())) {
error = "orte_grpcomm_barrier failed";
goto error;
}
/* wire up the oob interface, if requested. Do this here because
it will go much faster before the event library is switched
into non-blocking mode */
@ -606,11 +600,31 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
goto error;
}
/* Do we need to wait for a debugger? */
ompi_wait_for_debugger();
/* check for timing request - get stop time and report elapsed
time if so, then start the clock again */
if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
gettimeofday(&ompistop, NULL);
opal_output(0, "ompi_mpi_init[%ld]: time from modex thru complete oob wireup %ld usec",
(long)ORTE_PROC_MY_NAME->vpid,
(long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
(ompistop.tv_usec - ompistart.tv_usec)));
gettimeofday(&ompistart, NULL);
}
/* wait for everyone to reach this point */
if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier())) {
error = "orte_grpcomm_barrier failed";
goto error;
}
/* check for timing request - get stop time and report elapsed
time if so, then start the clock again */
if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
gettimeofday(&ompistop, NULL);
opal_output(0, "ompi_mpi_init[%ld]: time from stage 2 cast to complete oob wireup %ld usec",
opal_output(0, "ompi_mpi_init[%ld]: time to execute barrier %ld usec",
(long)ORTE_PROC_MY_NAME->vpid,
(long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
(ompistop.tv_usec - ompistart.tv_usec)));
@ -772,13 +786,10 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
ompi_mpi_initialized = true;
/* Do we need to wait for a debugger? */
ompi_wait_for_debugger();
/* check for timing request - get stop time and report elapsed time if so */
if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
gettimeofday(&ompistop, NULL);
opal_output(0, "ompi_mpi_init[%ld]: time from oob wireup to complete mpi_init %ld usec",
opal_output(0, "ompi_mpi_init[%ld]: time from barrier p to complete mpi_init %ld usec",
(long)ORTE_PROC_MY_NAME->vpid,
(long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
(ompistop.tv_usec - ompistart.tv_usec)));

Просмотреть файл

@ -50,7 +50,6 @@
#include "orte/runtime/orte_wait.h"
#include "orte/util/name_fns.h"
#include "orte/util/totalview.h"
#include "orte/util/nidmap.h"
#include "orte/mca/plm/base/plm_private.h"
@ -214,9 +213,6 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
return rc;
}
/* init any debuggers */
orte_totalview_init_after_spawn(job);
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:launch completed for job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),

Просмотреть файл

@ -103,6 +103,8 @@ BEGIN_C_DECLS
/* show help */
#define ORTE_RML_TAG_SHOW_HELP 31
/* debugger release */
#define ORTE_RML_TAG_DEBUGGER_RELEASE 32
#define ORTE_RML_TAG_MAX 100

Просмотреть файл

@ -39,6 +39,8 @@ endif # OMPI_INSTALL_BINARIES
orterun_SOURCES = \
main.c \
orterun.c \
orterun.h
orterun.h \
debuggers.h \
debuggers.c
orterun_LDADD = $(top_builddir)/orte/libopen-rte.la

Просмотреть файл

@ -53,37 +53,34 @@
#endif /* HAVE_UNISTD_H */
#include <ctype.h>
#include "opal/util/opal_environ.h"
#include "orte/util/show_help.h"
#include "opal/util/argv.h"
#include "opal/util/path.h"
#include "opal/util/os_path.h"
#include "opal/class/opal_list.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/plm/plm_types.h"
#include "orte/mca/rmaps/rmaps.h"
#include "orte/runtime/runtime.h"
#include "orte/mca/rml/rml.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/show_help.h"
#include "orte/util/totalview.h"
#include "debuggers.h"
/* +++ begin MPICH/TotalView interface definitions */
#define MPIR_DEBUG_SPAWNED 1
#define MPIR_DEBUG_ABORTING 2
/* +++ begin MPICH/TotalView std debugger interface definitions */
struct MPIR_PROCDESC {
char *host_name; /* something that can be passed to inet_addr */
char *executable_name; /* name of binary */
int pid; /* process pid */
};
struct MPIR_PROCDESC *MPIR_proctable = NULL;
int MPIR_proctable_size = 0;
int MPIR_being_debugged = 0;
int MPIR_force_to_main = 0;
bool MPIR_being_debugged = false;
volatile int MPIR_debug_state = 0;
volatile int MPIR_i_am_starter = 0;
volatile int MPIR_acquired_pre_main = 0;
volatile int MPIR_partial_attach_ok = 1;
/* --- end MPICH/TotalView interface definitions */
/* --- end MPICH/TotalView std debugger interface definitions */
#define DUMP_INT(X) fprintf(stderr, " %s = %d\n", # X, X);
@ -94,7 +91,7 @@ static void dump(void)
DUMP_INT(MPIR_being_debugged);
DUMP_INT(MPIR_debug_state);
DUMP_INT(MPIR_acquired_pre_main);
DUMP_INT(MPIR_partial_attach_ok);
DUMP_INT(MPIR_i_am_starter);
DUMP_INT(MPIR_proctable_size);
fprintf(stderr, " MPIR_proctable:\n");
@ -365,31 +362,29 @@ void orte_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
* spawn we need to check if we are being run under a TotalView-like
* debugger; if so then inform applications via an MCA parameter.
*/
void orte_totalview_init_before_spawn(void)
void orte_debugger_init_before_spawn(orte_job_t *jdata)
{
if (MPIR_DEBUG_SPAWNED == MPIR_being_debugged) {
char *s;
orte_app_context_t **apps;
orte_std_cntr_t i;
int value;
char *s;
if (orte_debug_flag) {
opal_output(0, "Info: Spawned by a debugger");
}
if (mca_base_param_reg_int_name("ompi", "mpi_wait_for_totalview",
"Whether the MPI application should wait for a debugger or not",
false, false, (int)false, &value) < 0) {
opal_output(0, "Error: mca_base_param_reg_int_name\n");
}
/* push mca parameter into the environment (not done automatically?) */
s = mca_base_param_environ_variable("ompi", "mpi_wait_for_totalview", NULL);
if (ORTE_SUCCESS != opal_setenv(s, "1", true, &environ)) {
opal_output(0, "Error: Can't setenv %s\n", s);
}
free(s);
if (!MPIR_being_debugged) {
/* not being debugged */
return;
}
if (orte_debug_flag) {
opal_output(0, "Info: Spawned by a debugger");
}
apps = (orte_app_context_t**)jdata->apps->addr;
/* tell the procs they are being debugged */
s = mca_base_param_environ_variable("ompi", "mpi_being_debugged", NULL);
for (i=0; i < jdata->num_apps; i++) {
opal_setenv(s, "1", true, &apps[i]->env);
}
free(s);
}
@ -401,80 +396,65 @@ void orte_totalview_init_before_spawn(void)
*
* @param jdata The job data object of the job that was spawned (formerly the jobid returned by spawn).
*/
void orte_totalview_init_after_spawn(orte_jobid_t jobid)
void orte_debugger_init_after_spawn(orte_job_t *jdata)
{
orte_job_t *jdata;
orte_proc_t **procs;
orte_app_context_t *appctx, **apps;
orte_vpid_t i, j;
opal_buffer_t buf;
orte_process_name_t rank0;
int rc;
if (!MPIR_being_debugged) {
/* not being debugged */
return;
}
if (MPIR_proctable) {
/* already initialized */
return;
}
if (0) { /* debugging daemons <<-- needs work */
if (orte_debug_flag) {
opal_output(0, "Info: Setting up debugger process table for daemons\n");
}
} else {
/*
* Debugging applications or not being debugged.
*
* Either way, fill in the proc table for the application
* processes in case someone attaches later.
*/
if (orte_debug_flag) {
opal_output(0, "Info: Setting up debugger process table for applications\n");
}
MPIR_debug_state = 1;
/* Get the job data for this job */
if (NULL == (jdata = orte_get_job_data_object(jobid))) {
opal_output(0, "Error: Can't get job data\n");
return;
/* fill in the proc table for the application processes */
if (orte_debug_flag) {
opal_output(0, "Info: Setting up debugger process table for applications\n");
}
MPIR_debug_state = 1;
/* set the total number of processes in the job */
MPIR_proctable_size = jdata->num_procs;
/* allocate MPIR_proctable */
MPIR_proctable = (struct MPIR_PROCDESC *) malloc(sizeof(struct MPIR_PROCDESC) *
MPIR_proctable_size);
if (MPIR_proctable == NULL) {
opal_output(0, "Error: Out of memory\n");
return;
}
/* initialize MPIR_proctable */
i=0;
procs = (orte_proc_t**)jdata->procs->addr;
apps = (orte_app_context_t**)jdata->apps->addr;
for (j=0; j < jdata->num_procs; j++) {
if (NULL == procs[j]) {
opal_output(0, "Error: undefined proc at position %ld\n", (long)j);
}
/* set the total number of processes in the job */
MPIR_proctable_size = jdata->num_procs;
/* allocate MPIR_proctable */
MPIR_proctable = (struct MPIR_PROCDESC *) malloc(sizeof(struct MPIR_PROCDESC) *
MPIR_proctable_size);
if (MPIR_proctable == NULL) {
opal_output(0, "Error: Out of memory\n");
}
/* initialize MPIR_proctable */
i=0;
procs = (orte_proc_t**)jdata->procs->addr;
apps = (orte_app_context_t**)jdata->apps->addr;
for (j=0; j < jdata->num_procs; j++) {
if (NULL == procs[j]) {
opal_output(0, "Error: undefined proc at position %ld\n", (long)j);
}
appctx = apps[procs[j]->app_idx];
MPIR_proctable[i].host_name = strdup(procs[j]->node->name);
if ( 0 == strncmp(appctx->app, OPAL_PATH_SEP, 1 )) {
MPIR_proctable[i].executable_name =
opal_os_path( false, appctx->app, NULL );
} else {
MPIR_proctable[i].executable_name =
opal_os_path( false, appctx->cwd, appctx->app, NULL );
}
MPIR_proctable[i].pid = procs[j]->pid;
i++;
}
appctx = apps[procs[j]->app_idx];
MPIR_proctable[i].host_name = strdup(procs[j]->node->name);
if ( 0 == strncmp(appctx->app, OPAL_PATH_SEP, 1 )) {
MPIR_proctable[i].executable_name =
opal_os_path( false, appctx->app, NULL );
} else {
MPIR_proctable[i].executable_name =
opal_os_path( false, appctx->cwd, appctx->app, NULL );
}
MPIR_proctable[i].pid = procs[j]->pid;
i++;
}
if (orte_debug_flag) {
@ -482,6 +462,15 @@ void orte_totalview_init_after_spawn(orte_jobid_t jobid)
}
(void) MPIR_Breakpoint();
/* send a message to rank=0 to release it */
OBJ_CONSTRUCT(&buf, opal_buffer_t); /* don't need anything in this */
rank0.jobid = jdata->jobid;
rank0.vpid = 0;
if (0 > (rc = orte_rml.send_buffer(&rank0, &buf, ORTE_RML_TAG_DEBUGGER_RELEASE, 0))) {
opal_output(0, "Error: could not send debugger release to MPI procs - error %s", ORTE_ERROR_NAME(rc));
}
OBJ_DESTRUCT(&buf);
}
@ -489,10 +478,11 @@ void orte_totalview_init_after_spawn(orte_jobid_t jobid)
* Release resources associated with data structures for running under
* a debugger using the MPICH/TotalView parallel debugger interface.
*/
void orte_totalview_finalize(void)
void orte_debugger_finalize(void)
{
if (MPIR_proctable) {
free(MPIR_proctable);
MPIR_proctable = NULL;
}
}

Просмотреть файл

@ -17,33 +17,23 @@
* $HEADER$
*/
#ifndef ORTE_TOTALVIEW_H
#define ORTE_TOTALVIEW_H
#ifndef ORTE_DEBUGGERS_H
#define ORTE_DEBUGGERS_H
#include "orte_config.h"
#include "orte/runtime/orte_globals.h"
BEGIN_C_DECLS
ORTE_DECLSPEC void orte_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
void orte_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
int argc, char *argv[], int num_procs) __opal_attribute_noreturn__;
ORTE_DECLSPEC void orte_totalview_init_before_spawn(void);
ORTE_DECLSPEC void orte_totalview_init_after_spawn(orte_jobid_t jobid);
ORTE_DECLSPEC void orte_totalview_finalize(void);
void orte_debugger_init_before_spawn(orte_job_t *jdata);
void orte_debugger_init_after_spawn(orte_job_t *jdata);
void orte_debugger_finalize(void);
ORTE_DECLSPEC extern void *MPIR_Breakpoint(void);
struct MPIR_PROCDESC {
char *host_name; /* something that can be passed to inet_addr */
char *executable_name; /* name of binary */
int pid; /* process pid */
};
ORTE_DECLSPEC extern struct MPIR_PROCDESC *MPIR_proctable;
ORTE_DECLSPEC extern int MPIR_proctable_size;
ORTE_DECLSPEC extern int MPIR_being_debugged;
ORTE_DECLSPEC extern volatile int MPIR_debug_state;
extern void *MPIR_Breakpoint(void);
END_C_DECLS
#endif /* ORTE_TOTALVIEW_H */
#endif /* ORTE_DEBUGGERS_H */

Просмотреть файл

@ -71,7 +71,6 @@
#include "orte/util/pre_condition_transports.h"
#include "orte/util/session_dir.h"
#include "orte/util/name_fns.h"
#include "orte/util/totalview.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/plm/plm.h"
@ -90,6 +89,7 @@
/* ensure I can behave like a daemon */
#include "orte/orted/orted.h"
#include "debuggers.h"
#include "orterun.h"
/*
@ -524,7 +524,9 @@ int orterun(int argc, char *argv[])
signal_forward_callback, &sigusr2_handler);
opal_signal_add(&sigusr2_handler, NULL);
#endif /* __WINDOWS__ */
orte_totalview_init_before_spawn();
/* setup for debugging, if we are doing so */
orte_debugger_init_before_spawn(jdata);
/* setup an event we can wait for that will tell
* us to terminate - both normal and abnormal
@ -542,6 +544,9 @@ int orterun(int argc, char *argv[])
/* Spawn the job */
rc = orte_plm.spawn(jdata);
/* complete debugger interface, if we are debugging */
orte_debugger_init_after_spawn(jdata);
/* now wait until the termination event fires */
opal_event_dispatch();
@ -604,7 +609,7 @@ static void job_completed(int trigpipe, short event, void *arg)
}
/* if the debuggers were run, clean up */
orte_totalview_finalize();
orte_debugger_finalize();
/* the job is complete - now setup an event that will
* trigger when the orteds are gone and tell the orteds that it is

Просмотреть файл

@ -47,8 +47,7 @@ headers += \
util/hostfile/hostfile_lex.h \
util/dash_host/dash_host.h \
util/comm/comm.h \
util/nidmap.h \
util/totalview.h
util/nidmap.h
libopen_rte_la_SOURCES += \
util/context_fns.c \
@ -58,6 +57,5 @@ libopen_rte_la_SOURCES += \
util/hostfile/hostfile.c \
util/dash_host/dash_host.c \
util/comm/comm.c \
util/nidmap.c \
util/totalview.c
util/nidmap.c
endif