Update the debugger interface per the email thread with Jeff and Brian. Hand off to them for final testing and validation.
This commit was SVN r18670.
Этот коммит содержится в:
родитель
558e68088c
Коммит
282a220e7e
@ -65,6 +65,9 @@
|
|||||||
#include "ompi/datatype/datatype.h"
|
#include "ompi/datatype/datatype.h"
|
||||||
#include "ompi/include/mpi.h"
|
#include "ompi/include/mpi.h"
|
||||||
|
|
||||||
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
|
#include "orte/mca/rml/rml.h"
|
||||||
|
|
||||||
#if defined(OMPI_MSGQ_DLL)
|
#if defined(OMPI_MSGQ_DLL)
|
||||||
/* This variable is old/deprecated -- the mpimsgq_dll_locations[]
|
/* This variable is old/deprecated -- the mpimsgq_dll_locations[]
|
||||||
method is preferred because it's more flexible */
|
method is preferred because it's more flexible */
|
||||||
@ -107,6 +110,8 @@ OMPI_DECLSPEC ompi_datatype_t* ompi_datatype_t_type_inclusion = NULL;
|
|||||||
|
|
||||||
OMPI_DECLSPEC volatile int MPIR_debug_gate=0;
|
OMPI_DECLSPEC volatile int MPIR_debug_gate=0;
|
||||||
|
|
||||||
|
/* we don't believe we need MPIR_being_debugged here */
|
||||||
|
|
||||||
/* Check for a file in a few different ways for portability */
|
/* Check for a file in a few different ways for portability */
|
||||||
static void check(char *dir, char *file, char **locations)
|
static void check(char *dir, char *file, char **locations)
|
||||||
{
|
{
|
||||||
@ -144,30 +149,33 @@ static void check(char *dir, char *file, char **locations)
|
|||||||
*/
|
*/
|
||||||
void ompi_wait_for_debugger(void)
|
void ompi_wait_for_debugger(void)
|
||||||
{
|
{
|
||||||
int i, wait_for_debugger, wait_for_tv;
|
int i, debugger, rc;
|
||||||
char *a, *b, **dirs;
|
char *a, *b, **dirs;
|
||||||
|
opal_buffer_t buf;
|
||||||
|
|
||||||
/* Do we need to wait for a TotalView-like debugger? */
|
/* are we being debugged by a TotalView-like debugger? */
|
||||||
mca_base_param_reg_int_name("ompi",
|
mca_base_param_reg_int_name("ompi",
|
||||||
"mpi_wait_for_debugger",
|
"mpi_being_debugged",
|
||||||
"Whether the MPI application "
|
"Whether the MPI application "
|
||||||
"should wait for a debugger or not",
|
"is being debugged (default: false)",
|
||||||
false, false, (int) false,
|
false, false, (int) false,
|
||||||
&wait_for_debugger);
|
&debugger);
|
||||||
mca_base_param_reg_int_name("ompi",
|
|
||||||
"mpi_wait_for_totalview",
|
if (!debugger) {
|
||||||
"Deprecated synonym for mpi_wait_for_debugger",
|
/* if not, just return */
|
||||||
false, false, (int) false,
|
return;
|
||||||
&wait_for_tv);
|
}
|
||||||
wait_for_debugger |= wait_for_tv;
|
|
||||||
|
/* if we are being debugged, then we need to find
|
||||||
|
* the correct plug-in
|
||||||
|
*/
|
||||||
a = strdup(opal_install_dirs.pkglibdir);
|
a = strdup(opal_install_dirs.pkglibdir);
|
||||||
mca_base_param_reg_string_name("ompi",
|
mca_base_param_reg_string_name("ompi",
|
||||||
"debugger_dll_path",
|
"debugger_dll_path",
|
||||||
"List of directories where MPI_INIT should search for debugger plugins",
|
"List of directories where MPI_INIT should search for debugger plugins",
|
||||||
false, false, a, &b);
|
false, false, a, &b);
|
||||||
free(a);
|
free(a);
|
||||||
|
|
||||||
/* Search the directory for MPI debugger DLLs */
|
/* Search the directory for MPI debugger DLLs */
|
||||||
if (NULL != b) {
|
if (NULL != b) {
|
||||||
dirs = opal_argv_split(b, ':');
|
dirs = opal_argv_split(b, ':');
|
||||||
@ -176,23 +184,53 @@ void ompi_wait_for_debugger(void)
|
|||||||
check(dirs[i], OMPI_MSGQ_DLL_PREFIX, mpimsgq_dll_locations);
|
check(dirs[i], OMPI_MSGQ_DLL_PREFIX, mpimsgq_dll_locations);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* If we're waiting for the debugger, then, well, wait for it. :-) */
|
/* only the rank=0 proc waits for the debugger - everyone else will just
|
||||||
if (wait_for_debugger) {
|
* spin in the barrier in mpi_init until rank=0 joins them
|
||||||
/* RHC: the following is a temporary hack until we figure
|
*/
|
||||||
* out how to resolve the problem of where to
|
if (0 != ORTE_PROC_MY_NAME->vpid) {
|
||||||
* instance the MPIR* variables so that multiple
|
return;
|
||||||
* launchers can access them
|
}
|
||||||
|
|
||||||
|
/* we have to support at least two ways of completing the
|
||||||
|
* debug attachment - either we will get a message from
|
||||||
|
* the HNP telling us it is okay to release, or the debugger
|
||||||
|
* itself will reach into us and set a gate.
|
||||||
|
*
|
||||||
|
* First, attempt to get a message-based release
|
||||||
|
*/
|
||||||
|
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||||
|
rc = orte_rml.recv_buffer(ORTE_NAME_WILDCARD, &buf, ORTE_RML_TAG_DEBUGGER_RELEASE, 0);
|
||||||
|
OBJ_DESTRUCT(&buf); /* don't care about contents of message */
|
||||||
|
|
||||||
|
if (rc > 0) {
|
||||||
|
/* message received - we can go! */
|
||||||
|
return;
|
||||||
|
} else if (ORTE_ERR_NOT_SUPPORTED == rc) {
|
||||||
|
/* if the recv isn't supported, then we fall back
|
||||||
|
* to the alternative method for waiting
|
||||||
*/
|
*/
|
||||||
while (MPIR_debug_gate == 0) {
|
goto spin_wait;
|
||||||
|
} else {
|
||||||
|
/* if it failed for some other reason, then we are
|
||||||
|
* in trouble - for now, just report the problem
|
||||||
|
* and give up waiting
|
||||||
|
*/
|
||||||
|
opal_output(0, "Debugger_attach[rank=%ld]: could not wait for debugger - error %s!",
|
||||||
|
(long)ORTE_PROC_MY_NAME->vpid, ORTE_ERROR_NAME(rc));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
spin_wait:
|
||||||
|
/* spin until debugger attaches and releases us */
|
||||||
|
while (MPIR_debug_gate == 0) {
|
||||||
#if defined(__WINDOWS__)
|
#if defined(__WINDOWS__)
|
||||||
Sleep(100); /* milliseconds */
|
Sleep(100); /* milliseconds */
|
||||||
#elif defined(HAVE_USLEEP)
|
#elif defined(HAVE_USLEEP)
|
||||||
usleep(100000); /* microseconds */
|
usleep(100000); /* microseconds */
|
||||||
#else
|
#else
|
||||||
sleep(1); /* seconds */
|
sleep(1); /* seconds */
|
||||||
#endif
|
#endif
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -592,12 +592,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
orte_process_info.nodename);
|
orte_process_info.nodename);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* wait for everyone to reach this point */
|
|
||||||
if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier())) {
|
|
||||||
error = "orte_grpcomm_barrier failed";
|
|
||||||
goto error;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* wire up the oob interface, if requested. Do this here because
|
/* wire up the oob interface, if requested. Do this here because
|
||||||
it will go much faster before the event library is switched
|
it will go much faster before the event library is switched
|
||||||
into non-blocking mode */
|
into non-blocking mode */
|
||||||
@ -606,11 +600,31 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Do we need to wait for a debugger? */
|
||||||
|
ompi_wait_for_debugger();
|
||||||
|
|
||||||
|
/* check for timing request - get stop time and report elapsed
|
||||||
|
time if so, then start the clock again */
|
||||||
|
if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
|
||||||
|
gettimeofday(&ompistop, NULL);
|
||||||
|
opal_output(0, "ompi_mpi_init[%ld]: time from modex thru complete oob wireup %ld usec",
|
||||||
|
(long)ORTE_PROC_MY_NAME->vpid,
|
||||||
|
(long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
|
||||||
|
(ompistop.tv_usec - ompistart.tv_usec)));
|
||||||
|
gettimeofday(&ompistart, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* wait for everyone to reach this point */
|
||||||
|
if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier())) {
|
||||||
|
error = "orte_grpcomm_barrier failed";
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
|
||||||
/* check for timing request - get stop time and report elapsed
|
/* check for timing request - get stop time and report elapsed
|
||||||
time if so, then start the clock again */
|
time if so, then start the clock again */
|
||||||
if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
|
if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
|
||||||
gettimeofday(&ompistop, NULL);
|
gettimeofday(&ompistop, NULL);
|
||||||
opal_output(0, "ompi_mpi_init[%ld]: time from stage 2 cast to complete oob wireup %ld usec",
|
opal_output(0, "ompi_mpi_init[%ld]: time to execute barrier %ld usec",
|
||||||
(long)ORTE_PROC_MY_NAME->vpid,
|
(long)ORTE_PROC_MY_NAME->vpid,
|
||||||
(long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
|
(long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
|
||||||
(ompistop.tv_usec - ompistart.tv_usec)));
|
(ompistop.tv_usec - ompistart.tv_usec)));
|
||||||
@ -772,13 +786,10 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
|
|
||||||
ompi_mpi_initialized = true;
|
ompi_mpi_initialized = true;
|
||||||
|
|
||||||
/* Do we need to wait for a debugger? */
|
|
||||||
ompi_wait_for_debugger();
|
|
||||||
|
|
||||||
/* check for timing request - get stop time and report elapsed time if so */
|
/* check for timing request - get stop time and report elapsed time if so */
|
||||||
if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
|
if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
|
||||||
gettimeofday(&ompistop, NULL);
|
gettimeofday(&ompistop, NULL);
|
||||||
opal_output(0, "ompi_mpi_init[%ld]: time from oob wireup to complete mpi_init %ld usec",
|
opal_output(0, "ompi_mpi_init[%ld]: time from barrier p to complete mpi_init %ld usec",
|
||||||
(long)ORTE_PROC_MY_NAME->vpid,
|
(long)ORTE_PROC_MY_NAME->vpid,
|
||||||
(long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
|
(long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
|
||||||
(ompistop.tv_usec - ompistart.tv_usec)));
|
(ompistop.tv_usec - ompistart.tv_usec)));
|
||||||
|
@ -50,7 +50,6 @@
|
|||||||
#include "orte/runtime/orte_wait.h"
|
#include "orte/runtime/orte_wait.h"
|
||||||
|
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
#include "orte/util/totalview.h"
|
|
||||||
#include "orte/util/nidmap.h"
|
#include "orte/util/nidmap.h"
|
||||||
|
|
||||||
#include "orte/mca/plm/base/plm_private.h"
|
#include "orte/mca/plm/base/plm_private.h"
|
||||||
@ -214,9 +213,6 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* init any debuggers */
|
|
||||||
orte_totalview_init_after_spawn(job);
|
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||||
"%s plm:base:launch completed for job %s",
|
"%s plm:base:launch completed for job %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
@ -103,6 +103,8 @@ BEGIN_C_DECLS
|
|||||||
/* show help */
|
/* show help */
|
||||||
#define ORTE_RML_TAG_SHOW_HELP 31
|
#define ORTE_RML_TAG_SHOW_HELP 31
|
||||||
|
|
||||||
|
/* debugger release */
|
||||||
|
#define ORTE_RML_TAG_DEBUGGER_RELEASE 32
|
||||||
|
|
||||||
#define ORTE_RML_TAG_MAX 100
|
#define ORTE_RML_TAG_MAX 100
|
||||||
|
|
||||||
|
@ -39,6 +39,8 @@ endif # OMPI_INSTALL_BINARIES
|
|||||||
orterun_SOURCES = \
|
orterun_SOURCES = \
|
||||||
main.c \
|
main.c \
|
||||||
orterun.c \
|
orterun.c \
|
||||||
orterun.h
|
orterun.h \
|
||||||
|
debuggers.h \
|
||||||
|
debuggers.c
|
||||||
|
|
||||||
orterun_LDADD = $(top_builddir)/orte/libopen-rte.la
|
orterun_LDADD = $(top_builddir)/orte/libopen-rte.la
|
||||||
|
@ -53,37 +53,34 @@
|
|||||||
#endif /* HAVE_UNISTD_H */
|
#endif /* HAVE_UNISTD_H */
|
||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
|
|
||||||
#include "opal/util/opal_environ.h"
|
|
||||||
#include "orte/util/show_help.h"
|
|
||||||
#include "opal/util/argv.h"
|
#include "opal/util/argv.h"
|
||||||
#include "opal/util/path.h"
|
#include "opal/util/path.h"
|
||||||
#include "opal/util/os_path.h"
|
#include "opal/util/os_path.h"
|
||||||
#include "opal/class/opal_list.h"
|
#include "opal/mca/base/mca_base_param.h"
|
||||||
#include "opal/mca/base/base.h"
|
|
||||||
|
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/mca/plm/plm_types.h"
|
#include "orte/mca/rml/rml.h"
|
||||||
#include "orte/mca/rmaps/rmaps.h"
|
|
||||||
#include "orte/runtime/runtime.h"
|
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
|
#include "orte/util/show_help.h"
|
||||||
|
|
||||||
#include "orte/util/totalview.h"
|
#include "debuggers.h"
|
||||||
|
|
||||||
/* +++ begin MPICH/TotalView interface definitions */
|
/* +++ begin MPICH/TotalView std debugger interface definitions */
|
||||||
|
|
||||||
#define MPIR_DEBUG_SPAWNED 1
|
|
||||||
#define MPIR_DEBUG_ABORTING 2
|
|
||||||
|
|
||||||
|
struct MPIR_PROCDESC {
|
||||||
|
char *host_name; /* something that can be passed to inet_addr */
|
||||||
|
char *executable_name; /* name of binary */
|
||||||
|
int pid; /* process pid */
|
||||||
|
};
|
||||||
|
|
||||||
struct MPIR_PROCDESC *MPIR_proctable = NULL;
|
struct MPIR_PROCDESC *MPIR_proctable = NULL;
|
||||||
int MPIR_proctable_size = 0;
|
int MPIR_proctable_size = 0;
|
||||||
int MPIR_being_debugged = 0;
|
bool MPIR_being_debugged = false;
|
||||||
int MPIR_force_to_main = 0;
|
|
||||||
volatile int MPIR_debug_state = 0;
|
volatile int MPIR_debug_state = 0;
|
||||||
volatile int MPIR_i_am_starter = 0;
|
volatile int MPIR_i_am_starter = 0;
|
||||||
volatile int MPIR_acquired_pre_main = 0;
|
volatile int MPIR_partial_attach_ok = 1;
|
||||||
|
|
||||||
/* --- end MPICH/TotalView interface definitions */
|
/* --- end MPICH/TotalView std debugger interface definitions */
|
||||||
|
|
||||||
|
|
||||||
#define DUMP_INT(X) fprintf(stderr, " %s = %d\n", # X, X);
|
#define DUMP_INT(X) fprintf(stderr, " %s = %d\n", # X, X);
|
||||||
@ -94,7 +91,7 @@ static void dump(void)
|
|||||||
|
|
||||||
DUMP_INT(MPIR_being_debugged);
|
DUMP_INT(MPIR_being_debugged);
|
||||||
DUMP_INT(MPIR_debug_state);
|
DUMP_INT(MPIR_debug_state);
|
||||||
DUMP_INT(MPIR_acquired_pre_main);
|
DUMP_INT(MPIR_partial_attach_ok);
|
||||||
DUMP_INT(MPIR_i_am_starter);
|
DUMP_INT(MPIR_i_am_starter);
|
||||||
DUMP_INT(MPIR_proctable_size);
|
DUMP_INT(MPIR_proctable_size);
|
||||||
fprintf(stderr, " MPIR_proctable:\n");
|
fprintf(stderr, " MPIR_proctable:\n");
|
||||||
@ -365,31 +362,29 @@ void orte_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
|
|||||||
* spawn we need to check if we are being run under a TotalView-like
|
* spawn we need to check if we are being run under a TotalView-like
|
||||||
* debugger; if so then inform applications via an MCA parameter.
|
* debugger; if so then inform applications via an MCA parameter.
|
||||||
*/
|
*/
|
||||||
void orte_totalview_init_before_spawn(void)
|
void orte_debugger_init_before_spawn(orte_job_t *jdata)
|
||||||
{
|
{
|
||||||
if (MPIR_DEBUG_SPAWNED == MPIR_being_debugged) {
|
char *s;
|
||||||
|
orte_app_context_t **apps;
|
||||||
|
orte_std_cntr_t i;
|
||||||
|
|
||||||
int value;
|
if (!MPIR_being_debugged) {
|
||||||
char *s;
|
/* not being debugged */
|
||||||
|
return;
|
||||||
if (orte_debug_flag) {
|
|
||||||
opal_output(0, "Info: Spawned by a debugger");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (mca_base_param_reg_int_name("ompi", "mpi_wait_for_totalview",
|
|
||||||
"Whether the MPI application should wait for a debugger or not",
|
|
||||||
false, false, (int)false, &value) < 0) {
|
|
||||||
opal_output(0, "Error: mca_base_param_reg_int_name\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
/* push mca parameter into the environment (not done automatically?) */
|
|
||||||
|
|
||||||
s = mca_base_param_environ_variable("ompi", "mpi_wait_for_totalview", NULL);
|
|
||||||
if (ORTE_SUCCESS != opal_setenv(s, "1", true, &environ)) {
|
|
||||||
opal_output(0, "Error: Can't setenv %s\n", s);
|
|
||||||
}
|
|
||||||
free(s);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (orte_debug_flag) {
|
||||||
|
opal_output(0, "Info: Spawned by a debugger");
|
||||||
|
}
|
||||||
|
|
||||||
|
apps = (orte_app_context_t**)jdata->apps->addr;
|
||||||
|
/* tell the procs they are being debugged */
|
||||||
|
s = mca_base_param_environ_variable("ompi", "mpi_being_debugged", NULL);
|
||||||
|
|
||||||
|
for (i=0; i < jdata->num_apps; i++) {
|
||||||
|
opal_setenv(s, "1", true, &apps[i]->env);
|
||||||
|
}
|
||||||
|
free(s);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -401,80 +396,65 @@ void orte_totalview_init_before_spawn(void)
|
|||||||
*
|
*
|
||||||
* @param jobid The jobid returned by spawn.
|
* @param jobid The jobid returned by spawn.
|
||||||
*/
|
*/
|
||||||
void orte_totalview_init_after_spawn(orte_jobid_t jobid)
|
void orte_debugger_init_after_spawn(orte_job_t *jdata)
|
||||||
{
|
{
|
||||||
orte_job_t *jdata;
|
|
||||||
orte_proc_t **procs;
|
orte_proc_t **procs;
|
||||||
orte_app_context_t *appctx, **apps;
|
orte_app_context_t *appctx, **apps;
|
||||||
orte_vpid_t i, j;
|
orte_vpid_t i, j;
|
||||||
|
opal_buffer_t buf;
|
||||||
|
orte_process_name_t rank0;
|
||||||
|
int rc;
|
||||||
|
|
||||||
|
if (!MPIR_being_debugged) {
|
||||||
|
/* not being debugged */
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (MPIR_proctable) {
|
if (MPIR_proctable) {
|
||||||
/* already initialized */
|
/* already initialized */
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (0) { /* debugging daemons <<-- needs work */
|
/* fill in the proc table for the application processes */
|
||||||
|
|
||||||
if (orte_debug_flag) {
|
if (orte_debug_flag) {
|
||||||
opal_output(0, "Info: Setting up debugger process table for daemons\n");
|
opal_output(0, "Info: Setting up debugger process table for applications\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
MPIR_debug_state = 1;
|
||||||
|
|
||||||
/*
|
/* set the total number of processes in the job */
|
||||||
* Debugging applications or not being debugged.
|
MPIR_proctable_size = jdata->num_procs;
|
||||||
*
|
|
||||||
* Either way, fill in the proc table for the application
|
/* allocate MPIR_proctable */
|
||||||
* processes in case someone attaches later.
|
MPIR_proctable = (struct MPIR_PROCDESC *) malloc(sizeof(struct MPIR_PROCDESC) *
|
||||||
*/
|
MPIR_proctable_size);
|
||||||
|
if (MPIR_proctable == NULL) {
|
||||||
if (orte_debug_flag) {
|
opal_output(0, "Error: Out of memory\n");
|
||||||
opal_output(0, "Info: Setting up debugger process table for applications\n");
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
MPIR_debug_state = 1;
|
/* initialize MPIR_proctable */
|
||||||
|
i=0;
|
||||||
/* Get the job data for this job */
|
procs = (orte_proc_t**)jdata->procs->addr;
|
||||||
if (NULL == (jdata = orte_get_job_data_object(jobid))) {
|
apps = (orte_app_context_t**)jdata->apps->addr;
|
||||||
opal_output(0, "Error: Can't get job data\n");
|
for (j=0; j < jdata->num_procs; j++) {
|
||||||
return;
|
if (NULL == procs[j]) {
|
||||||
|
opal_output(0, "Error: undefined proc at position %ld\n", (long)j);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* set the total number of processes in the job */
|
appctx = apps[procs[j]->app_idx];
|
||||||
|
|
||||||
MPIR_proctable_size = jdata->num_procs;
|
MPIR_proctable[i].host_name = strdup(procs[j]->node->name);
|
||||||
|
if ( 0 == strncmp(appctx->app, OPAL_PATH_SEP, 1 )) {
|
||||||
/* allocate MPIR_proctable */
|
MPIR_proctable[i].executable_name =
|
||||||
|
opal_os_path( false, appctx->app, NULL );
|
||||||
MPIR_proctable = (struct MPIR_PROCDESC *) malloc(sizeof(struct MPIR_PROCDESC) *
|
} else {
|
||||||
MPIR_proctable_size);
|
MPIR_proctable[i].executable_name =
|
||||||
if (MPIR_proctable == NULL) {
|
opal_os_path( false, appctx->cwd, appctx->app, NULL );
|
||||||
opal_output(0, "Error: Out of memory\n");
|
}
|
||||||
}
|
MPIR_proctable[i].pid = procs[j]->pid;
|
||||||
|
i++;
|
||||||
/* initialize MPIR_proctable */
|
|
||||||
|
|
||||||
i=0;
|
|
||||||
procs = (orte_proc_t**)jdata->procs->addr;
|
|
||||||
apps = (orte_app_context_t**)jdata->apps->addr;
|
|
||||||
for (j=0; j < jdata->num_procs; j++) {
|
|
||||||
if (NULL == procs[j]) {
|
|
||||||
opal_output(0, "Error: undefined proc at position %ld\n", (long)j);
|
|
||||||
}
|
|
||||||
|
|
||||||
appctx = apps[procs[j]->app_idx];
|
|
||||||
|
|
||||||
MPIR_proctable[i].host_name = strdup(procs[j]->node->name);
|
|
||||||
if ( 0 == strncmp(appctx->app, OPAL_PATH_SEP, 1 )) {
|
|
||||||
MPIR_proctable[i].executable_name =
|
|
||||||
opal_os_path( false, appctx->app, NULL );
|
|
||||||
} else {
|
|
||||||
MPIR_proctable[i].executable_name =
|
|
||||||
opal_os_path( false, appctx->cwd, appctx->app, NULL );
|
|
||||||
}
|
|
||||||
MPIR_proctable[i].pid = procs[j]->pid;
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (orte_debug_flag) {
|
if (orte_debug_flag) {
|
||||||
@ -482,6 +462,15 @@ void orte_totalview_init_after_spawn(orte_jobid_t jobid)
|
|||||||
}
|
}
|
||||||
|
|
||||||
(void) MPIR_Breakpoint();
|
(void) MPIR_Breakpoint();
|
||||||
|
|
||||||
|
/* send a message to rank=0 to release it */
|
||||||
|
OBJ_CONSTRUCT(&buf, opal_buffer_t); /* don't need anything in this */
|
||||||
|
rank0.jobid = jdata->jobid;
|
||||||
|
rank0.vpid = 0;
|
||||||
|
if (0 > (rc = orte_rml.send_buffer(&rank0, &buf, ORTE_RML_TAG_DEBUGGER_RELEASE, 0))) {
|
||||||
|
opal_output(0, "Error: could not send debugger release to MPI procs - error %s", ORTE_ERROR_NAME(rc));
|
||||||
|
}
|
||||||
|
OBJ_DESTRUCT(&buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -489,10 +478,11 @@ void orte_totalview_init_after_spawn(orte_jobid_t jobid)
|
|||||||
* Release resources associated with data structures for running under
|
* Release resources associated with data structures for running under
|
||||||
* a debugger using the MPICH/TotalView parallel debugger interface.
|
* a debugger using the MPICH/TotalView parallel debugger interface.
|
||||||
*/
|
*/
|
||||||
void orte_totalview_finalize(void)
|
void orte_debugger_finalize(void)
|
||||||
{
|
{
|
||||||
if (MPIR_proctable) {
|
if (MPIR_proctable) {
|
||||||
free(MPIR_proctable);
|
free(MPIR_proctable);
|
||||||
|
MPIR_proctable = NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -17,33 +17,23 @@
|
|||||||
* $HEADER$
|
* $HEADER$
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef ORTE_TOTALVIEW_H
|
#ifndef ORTE_DEBUGGERS_H
|
||||||
#define ORTE_TOTALVIEW_H
|
#define ORTE_DEBUGGERS_H
|
||||||
|
|
||||||
#include "orte_config.h"
|
#include "orte_config.h"
|
||||||
|
|
||||||
|
#include "orte/runtime/orte_globals.h"
|
||||||
|
|
||||||
BEGIN_C_DECLS
|
BEGIN_C_DECLS
|
||||||
|
|
||||||
ORTE_DECLSPEC void orte_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
|
void orte_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
|
||||||
int argc, char *argv[], int num_procs) __opal_attribute_noreturn__;
|
int argc, char *argv[], int num_procs) __opal_attribute_noreturn__;
|
||||||
ORTE_DECLSPEC void orte_totalview_init_before_spawn(void);
|
void orte_debugger_init_before_spawn(orte_job_t *jdata);
|
||||||
ORTE_DECLSPEC void orte_totalview_init_after_spawn(orte_jobid_t jobid);
|
void orte_debugger_init_after_spawn(orte_job_t *jdata);
|
||||||
ORTE_DECLSPEC void orte_totalview_finalize(void);
|
void orte_debugger_finalize(void);
|
||||||
|
|
||||||
ORTE_DECLSPEC extern void *MPIR_Breakpoint(void);
|
extern void *MPIR_Breakpoint(void);
|
||||||
|
|
||||||
struct MPIR_PROCDESC {
|
|
||||||
char *host_name; /* something that can be passed to inet_addr */
|
|
||||||
char *executable_name; /* name of binary */
|
|
||||||
int pid; /* process pid */
|
|
||||||
};
|
|
||||||
|
|
||||||
ORTE_DECLSPEC extern struct MPIR_PROCDESC *MPIR_proctable;
|
|
||||||
ORTE_DECLSPEC extern int MPIR_proctable_size;
|
|
||||||
ORTE_DECLSPEC extern int MPIR_being_debugged;
|
|
||||||
ORTE_DECLSPEC extern volatile int MPIR_debug_state;
|
|
||||||
|
|
||||||
END_C_DECLS
|
END_C_DECLS
|
||||||
|
|
||||||
#endif /* ORTE_TOTALVIEW_H */
|
#endif /* ORTE_DEBUGGERS_H */
|
@ -71,7 +71,6 @@
|
|||||||
#include "orte/util/pre_condition_transports.h"
|
#include "orte/util/pre_condition_transports.h"
|
||||||
#include "orte/util/session_dir.h"
|
#include "orte/util/session_dir.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
#include "orte/util/totalview.h"
|
|
||||||
|
|
||||||
#include "orte/mca/odls/odls.h"
|
#include "orte/mca/odls/odls.h"
|
||||||
#include "orte/mca/plm/plm.h"
|
#include "orte/mca/plm/plm.h"
|
||||||
@ -90,6 +89,7 @@
|
|||||||
/* ensure I can behave like a daemon */
|
/* ensure I can behave like a daemon */
|
||||||
#include "orte/orted/orted.h"
|
#include "orte/orted/orted.h"
|
||||||
|
|
||||||
|
#include "debuggers.h"
|
||||||
#include "orterun.h"
|
#include "orterun.h"
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -524,7 +524,9 @@ int orterun(int argc, char *argv[])
|
|||||||
signal_forward_callback, &sigusr2_handler);
|
signal_forward_callback, &sigusr2_handler);
|
||||||
opal_signal_add(&sigusr2_handler, NULL);
|
opal_signal_add(&sigusr2_handler, NULL);
|
||||||
#endif /* __WINDOWS__ */
|
#endif /* __WINDOWS__ */
|
||||||
orte_totalview_init_before_spawn();
|
|
||||||
|
/* setup for debugging, if we are doing so */
|
||||||
|
orte_debugger_init_before_spawn(jdata);
|
||||||
|
|
||||||
/* setup an event we can wait for that will tell
|
/* setup an event we can wait for that will tell
|
||||||
* us to terminate - both normal and abnormal
|
* us to terminate - both normal and abnormal
|
||||||
@ -542,6 +544,9 @@ int orterun(int argc, char *argv[])
|
|||||||
/* Spawn the job */
|
/* Spawn the job */
|
||||||
rc = orte_plm.spawn(jdata);
|
rc = orte_plm.spawn(jdata);
|
||||||
|
|
||||||
|
/* complete debugger interface, if we are debugging */
|
||||||
|
orte_debugger_init_after_spawn(jdata);
|
||||||
|
|
||||||
/* now wait until the termination event fires */
|
/* now wait until the termination event fires */
|
||||||
opal_event_dispatch();
|
opal_event_dispatch();
|
||||||
|
|
||||||
@ -604,7 +609,7 @@ static void job_completed(int trigpipe, short event, void *arg)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* if the debuggers were run, clean up */
|
/* if the debuggers were run, clean up */
|
||||||
orte_totalview_finalize();
|
orte_debugger_finalize();
|
||||||
|
|
||||||
/* the job is complete - now setup an event that will
|
/* the job is complete - now setup an event that will
|
||||||
* trigger when the orteds are gone and tell the orteds that it is
|
* trigger when the orteds are gone and tell the orteds that it is
|
||||||
|
@ -47,8 +47,7 @@ headers += \
|
|||||||
util/hostfile/hostfile_lex.h \
|
util/hostfile/hostfile_lex.h \
|
||||||
util/dash_host/dash_host.h \
|
util/dash_host/dash_host.h \
|
||||||
util/comm/comm.h \
|
util/comm/comm.h \
|
||||||
util/nidmap.h \
|
util/nidmap.h
|
||||||
util/totalview.h
|
|
||||||
|
|
||||||
libopen_rte_la_SOURCES += \
|
libopen_rte_la_SOURCES += \
|
||||||
util/context_fns.c \
|
util/context_fns.c \
|
||||||
@ -58,6 +57,5 @@ libopen_rte_la_SOURCES += \
|
|||||||
util/hostfile/hostfile.c \
|
util/hostfile/hostfile.c \
|
||||||
util/dash_host/dash_host.c \
|
util/dash_host/dash_host.c \
|
||||||
util/comm/comm.c \
|
util/comm/comm.c \
|
||||||
util/nidmap.c \
|
util/nidmap.c
|
||||||
util/totalview.c
|
|
||||||
endif
|
endif
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user